diff --git a/configure b/configure index 25e8cef..1d6c652 100755 --- a/configure +++ b/configure @@ -230,6 +230,7 @@ Advanced options (experts only): --disable-armvfp disable ARM VFP optimizations --disable-iwmmxt disable iwmmxt optimizations --disable-mmi disable MMI optimizations + --disable-loongson2mmi disable LOONGSON2 Multi-Media Instructions usage" --disable-neon disable neon optimizations --disable-vis disable VIS optimizations --disable-yasm disable use of yasm assembler @@ -995,6 +996,7 @@ ARCH_EXT_LIST=' armvfp iwmmxt mmi + loongson2mmi mmx mmx2 neon @@ -2862,6 +2864,7 @@ if enabled arm; then fi if enabled mips; then echo "MMI enabled ${mmi-no}" + echo "LOONGSON2MMI enabled ${loongson2mmi-no}" fi if enabled ppc; then echo "AltiVec enabled ${altivec-no}" diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h index add4b10..8244e51 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -1586,6 +1586,8 @@ typedef struct AVCodecContext { #define FF_IDCT_SIMPLENEON 22 #define FF_IDCT_SIMPLEALPHA 23 #define FF_IDCT_BINK 24 +#define FF_IDCT_LIBMPEG2LOONGSON2 25 +#define FF_IDCT_XVIDLOONGSON2 26 /** * slice count diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index bbfdb6a..dfc3452 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -4525,6 +4525,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) if (HAVE_MMI) dsputil_init_mmi (c, avctx); if (ARCH_SH4) dsputil_init_sh4 (c, avctx); if (ARCH_BFIN) dsputil_init_bfin (c, avctx); + if (HAVE_LOONGSON2MMI) dsputil_init_loongson2(c, avctx); for(i=0; i<64; i++){ if(!c->put_2tap_qpel_pixels_tab[0][i]) diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index d1816e6..1a72ae9 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -636,6 +636,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx); void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx); void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx); void dsputil_init_vis(DSPContext* 
c, AVCodecContext *avctx); +void dsputil_init_loongson2(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_dwt(DSPContext *c); void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx); diff --git a/libavcodec/loongson2/dsputil_loongson2.c b/libavcodec/loongson2/dsputil_loongson2.c new file mode 100644 index 0000000..01bd3ac --- /dev/null +++ b/libavcodec/loongson2/dsputil_loongson2.c @@ -0,0 +1,221 @@ +/* + * Copyright(C) 2006-2010 comcat + * + * Optimized for Loongson2 CPUs by comcat + * + */ + +#include "dsputil_loongson2.h" +#include "../simple_idct.h" +#include "../mpegvideo.h" + +//extern void ff_idct_xvid_loongson2(short *block); + +extern void ff_loongson2_idct(DCTELEM *block); +extern void ff_idct_xvid_loongson2(short *block); + +static void add_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *restrict pixels, int line_size) +{ + const DCTELEM *p; + uint8_t *pix; + int i,j; + p = block; + pix = pixels; + i = 4; + j = line_size << 1; + __asm __volatile("xor $f14, $f14, $f14\n\t"); + do { + __asm __volatile( +// ".set mips3 \n\t" + "ldc1 $f0, 0(%2) \n\t" + "ldc1 $f2, 8(%2) \n\t" + "ldc1 $f4, 16(%2) \n\t" + "ldc1 $f6, 24(%2) \n\t" + "ldc1 $f8, %0 \n\t" + "ldc1 $f12, %1 \n\t" + "mov.d $f10, $f8 \n\t" + + "punpcklbh $f8, $f8, $f14 \n\t" + "punpckhbh $f10, $f10, $f14\n\t" + + "paddsh $f0, $f0, $f8 \n\t" + "paddsh $f2, $f2, $f10 \n\t" + + "mov.d $f10, $f12 \n\t" + + "punpcklbh $f12, $f12, $f14\n\t" + "punpckhbh $f10, $f10, $f14\n\t" + + "paddsh $f4, $f4, $f12 \n\t" + "paddsh $f6, $f6, $f10 \n\t" + + "packushb $f0, $f0, $f2 \n\t" + "packushb $f4, $f4, $f6 \n\t" + + "sdc1 $f0, %0 \n\t" + "sdc1 $f4, %1 \n\t" +// ".set mips2 \n\t" + :"+m"(*pix), "+m"(*(pix+line_size)) + :"r"(p) + :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","memory"); + pix += j; + p += 16; + } while (--i); + +} + +static void put_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *restrict pixels, int line_size) +{ + const DCTELEM *p; + uint8_t *pix; + int tmp = line_size 
* 3; + p = block; + pix = pixels; + __asm __volatile + ( +// ".set mips3 \n\t" + //"dadd $12, $0, $0\n\t" + //"dadd $13, $0, $0\n\t" + //"dadd $14, $0, $0\n\t" + + "ldc1 $f0, 0(%3)\n\t" + "ldc1 $f2, 8(%3)\n\t" + "ldc1 $f4, 16(%3)\n\t" + "ldc1 $f6, 24(%3)\n\t" + "ldc1 $f8, 32(%3)\n\t" + "ldc1 $f10, 40(%3)\n\t" + "ldc1 $f16, 48(%3)\n\t" + "ldc1 $f18, 56(%3)\n\t" + + "packushb $f0, $f0, $f2\n\t" + "packushb $f4, $f4, $f6\n\t" + "packushb $f8, $f8, $f10\n\t" + "packushb $f16, $f16, $f18\n\t" + + "add $12, %0, %1\n\t" + "add $13, $12, %1\n\t" + "add $14, %0, %2\n\t" + + "sdc1 $f0, 0(%0)\n\t" + "sdc1 $f4, 0($12)\n\t" + "sdc1 $f8, 0($13)\n\t" + "sdc1 $f16, 0($14)\n\t" +// ".set mips2\n\t" + : + :"r" (pix), "r" (line_size), "r" (tmp), "r"(p) + :"$12","$13","$14","$f0","$f2","$f4","$f6","$f8","$f10","$16","$18" + ); + + pix += line_size*4; + p += 32; + + __asm __volatile + ( +// ".set mips3 \n\t" + + "dadd $12, $0, $0\n\t" + "dadd $13, $0, $0\n\t" + "dadd $14, $0, $0\n\t" + "lw $12, %3\n\t" + + "ldc1 $f0, 0($12)\n\t" + "ldc1 $f2, 8($12)\n\t" + "ldc1 $f4, 16($12)\n\t" + "ldc1 $f6, 24($12)\n\t" + "ldc1 $f8, 32($12)\n\t" + "ldc1 $f10, 40($12)\n\t" + "ldc1 $f16, 48($12)\n\t" + "ldc1 $f18, 56($12)\n\t" + + "packushb $f0, $f0, $f2\n\t" + "packushb $f4, $f4, $f6\n\t" + "packushb $f8, $f8, $f10\n\t" + "packushb $f16, $f16, $f18\n\t" + + "add $12, %1, %0\n\t" + "add $13, $12, %1\n\t" + "add $15, %2, %0\n\t" + + "sdc1 $f0, 0(%0)\n\t" + "sdc1 $f4, 0($12)\n\t" + + "sdc1 $f8, 0($13)\n\t" + "sdc1 $f16, 0($15)\n\t" +// ".set mips2\n\t" + : + :"r" (pix), "r" (line_size), "r" (tmp), "m"(p) + :"$12","$13","$15","$f0","$f2","$f4","$f6","$f8","$f10","$16","$18","memory" + ); + +} + +/* +void put_signed_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *pixels, int line_size) +{ + +} + + +void ff_loongson2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_loongson2_idct(block); + put_pixels_clamped_loongson2(block, dest, line_size); +} + +void ff_loongson2_idct_add(uint8_t 
*dest, int line_size, DCTELEM *block) +{ + ff_loongson2_idct(block); + add_pixels_clamped_loongson2(block, dest, line_size); +}*/ + +static void ff_idct_xvid_loongson2_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_idct_xvid_loongson2(block); + put_pixels_clamped_loongson2(block, dest, line_size); +} + +static void ff_idct_xvid_loongson2_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_idct_xvid_loongson2(block); + add_pixels_clamped_loongson2(block, dest, line_size); +} + +void dsputil_init_loongson2(DSPContext *c, AVCodecContext *avctx) +{ + + const int idct_algo = avctx->idct_algo; + +/* +#ifdef CONFIG_ENCODERS + const int dct_algo = avctx->dct_algo; + if(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_LOONGSON2) + c->fdct = ff_fdct_loongson2; +#endif +*/ + +#if 0 + if(avctx->lowres==0) + { + if(idct_algo == FF_IDCT_LIBMPEG2LOONGSON2) + { + c->idct_add = ff_loongson2_idct_add; + c->idct_put = ff_loongson2_idct_put; + c->idct = ff_loongson2_idct; + } + else if(idct_algo == FF_IDCT_XVIDLOONGSON2) + { +#endif + c->idct_add = ff_idct_xvid_loongson2_add; + c->idct_put = ff_idct_xvid_loongson2_put; + c->idct = ff_idct_xvid_loongson2; + //} + //} + + c->put_pixels_clamped = put_pixels_clamped_loongson2; + c->add_pixels_clamped = add_pixels_clamped_loongson2; + +#ifdef CONFIG_ENCODERS + dsputil_init_pix_loongson2(c, avctx); +#endif + +} diff --git a/libavcodec/loongson2/dsputil_loongson2.d b/libavcodec/loongson2/dsputil_loongson2.d new file mode 100644 index 0000000..808f0a3 --- /dev/null +++ b/libavcodec/loongson2/dsputil_loongson2.d @@ -0,0 +1,18 @@ +libavcodec/loongson2/dsputil_loongson2.o: \ + libavcodec/loongson2/dsputil_loongson2.c \ + libavcodec/loongson2/dsputil_loongson2.h libavcodec/dsputil.h \ + libavutil/intreadwrite.h config.h libavutil/bswap.h \ + libavutil/attributes.h libavutil/common.h libavutil/intmath.h \ + libavutil/mem.h libavutil/internal.h libavutil/timer.h libavutil/libm.h \ + libavutil/mips/intreadwrite.h libavcodec/avcodec.h 
libavutil/avutil.h \ + libavutil/error.h libavutil/avutil.h libavutil/mathematics.h \ + libavutil/rational.h libavutil/intfloat_readwrite.h libavutil/log.h \ + libavutil/pixfmt.h libavutil/avconfig.h \ + libavcodec/loongson2/../simple_idct.h libavcodec/loongson2/../dsputil.h \ + libavcodec/loongson2/../mpegvideo.h libavcodec/loongson2/../get_bits.h \ + libavutil/bswap.h libavutil/common.h libavutil/log.h \ + libavcodec/loongson2/../mathops.h libavcodec/loongson2/../mips/mathops.h \ + libavcodec/loongson2/../put_bits.h libavcodec/loongson2/../ratecontrol.h \ + libavcodec/loongson2/../eval.h libavcodec/loongson2/../parser.h \ + libavcodec/loongson2/../avcodec.h libavcodec/loongson2/../mpeg12data.h \ + libavutil/rational.h libavcodec/loongson2/../rl.h diff --git a/libavcodec/loongson2/dsputil_loongson2.h b/libavcodec/loongson2/dsputil_loongson2.h new file mode 100644 index 0000000..87c7bd9 --- /dev/null +++ b/libavcodec/loongson2/dsputil_loongson2.h @@ -0,0 +1,3 @@ +#include "libavcodec/dsputil.h" + +void dsputil_init_pix_loongson2(DSPContext* c, AVCodecContext *avctx); diff --git a/libavcodec/loongson2/dsputil_loongson2.o b/libavcodec/loongson2/dsputil_loongson2.o new file mode 100644 index 0000000..fca0b55 Binary files /dev/null and b/libavcodec/loongson2/dsputil_loongson2.o differ diff --git a/libavcodec/loongson2/idct_loongson2.c b/libavcodec/loongson2/idct_loongson2.c new file mode 100644 index 0000000..539cab5 --- /dev/null +++ b/libavcodec/loongson2/idct_loongson2.c @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2000-2003 Michel Lespinasse + * Copyright (C) 1999-2000 Aaron Holtzman + * + * Copyright (c) 2007-2010 comcat . 
+ * + * Optimized for Loongson2 CPUs by comcat + * + * Based on i386 + */ + +#include "libavutil/common.h" +#include "dsputil_loongson2.h" + + +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) + + +#define ROW_SHIFT 11 +#define COL_SHIFT 6 + +#define round(bias) ((int)(((bias)+0.5) * (1< + * + * Optimized for Loongson2 CPUs by comcat + * + * Based on i386 + * + */ + + +#include +#include "../avcodec.h" + +void ff_idct_xvid_loongson2(short *block); + +//============================================================================= +// Macros and other preprocessor constants +//============================================================================= + +#define BITS_INV_ACC 5 // 4 or 5 for IEEE +#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 +#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 +#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) +#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) +#define RND_INV_CORR (RND_INV_COL - 1) + +#define BITS_FRW_ACC 3 // 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) +#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) + + +//----------------------------------------------------------------------------- +// Various memory constants (trigonometric values or rounding values) +//----------------------------------------------------------------------------- + +static const int16_t tg_1_16[4*4] attribute_used __attribute__ ((aligned(8))) = { + 13036,13036,13036,13036, // tg * (2<<16) + 0.5 + 27146,27146,27146,27146, // tg * (2<<16) + 0.5 + -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 + 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 + +static const int32_t rounder_0[2*8] attribute_used __attribute__ ((aligned(8))) = { + 65536,65536, + 3597,3597, + 2260,2260, + 1203,1203, + 0,0, + 120,120, + 512,512, + 512,512}; + + +// Table for rows 0,4 - constants are multiplied by cos_4_16 +static const int16_t tab_i_04_mmx[32*4] attribute_used __attribute__ ((aligned(8))) = { + 
16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 + 21407,8867,8867,-21407, // w07 w05 w03 w01 + 16384,-16384,16384,16384, // w14 w12 w10 w08 + -8867,21407,-21407,-8867, // w15 w13 w11 w09 + 22725,12873,19266,-22725, // w22 w20 w18 w16 + 19266,4520,-4520,-12873, // w23 w21 w19 w17 + 12873,4520,4520,19266, // w30 w28 w26 w24 + -22725,19266,-12873,-22725, // w31 w29 w27 w25 +// Table for rows 1,7 - constants are multiplied by cos_1_16 + 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 + 29692,12299,12299,-29692, // w07 w05 w03 w01 + 22725,-22725,22725,22725, // w14 w12 w10 w08 + -12299,29692,-29692,-12299, // w15 w13 w11 w09 + 31521,17855,26722,-31521, // w22 w20 w18 w16 + 26722,6270,-6270,-17855, // w23 w21 w19 w17 + 17855,6270,6270,26722, // w30 w28 w26 w24 + -31521,26722,-17855,-31521, // w31 w29 w27 w25 +// Table for rows 2,6 - constants are multiplied by cos_2_16 + 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 + 27969,11585,11585,-27969, // w07 w05 w03 w01 + 21407,-21407,21407,21407, // w14 w12 w10 w08 + -11585,27969,-27969,-11585, // w15 w13 w11 w09 + 29692,16819,25172,-29692, // w22 w20 w18 w16 + 25172,5906,-5906,-16819, // w23 w21 w19 w17 + 16819,5906,5906,25172, // w30 w28 w26 w24 + -29692,25172,-16819,-29692, // w31 w29 w27 w25 +// Table for rows 3,5 - constants are multiplied by cos_3_16 + 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 + 25172,10426,10426,-25172, // w07 w05 w03 w01 + 19266,-19266,19266,19266, // w14 w12 w10 w08 + -10426,25172,-25172,-10426, // w15 w13 w11 w09 + 26722,15137,22654,-26722, // w22 w20 w18 w16 + 22654,5315,-5315,-15137, // w23 w21 w19 w17 + 15137,5315,5315,22654, // w30 w28 w26 w24 + -26722,22654,-15137,-26722, // w31 w29 w27 w25 +}; + + +// %3 for rows 0,4 - constants are multiplied by cos_4_16 +static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8))) = { + 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 + 16384,8867,-16384,-21407, // w07 w06 w03 w02 + 16384,-8867,16384,-21407, // 
w13 w12 w09 w08 + -16384,21407,16384,-8867, // w15 w14 w11 w10 + 22725,19266,19266,-4520, // w21 w20 w17 w16 + 12873,4520,-22725,-12873, // w23 w22 w19 w18 + 12873,-22725,4520,-12873, // w29 w28 w25 w24 + 4520,19266,19266,-22725, // w31 w30 w27 w26 +// %3 for rows 1,7 - constants are multiplied by cos_1_16 + 22725,29692,22725,12299, // movq-> w05 w04 w01 w00 + 22725,12299,-22725,-29692, // w07 w06 w03 w02 + 22725,-12299,22725,-29692, // w13 w12 w09 w08 + -22725,29692,22725,-12299, // w15 w14 w11 w10 + 31521,26722,26722,-6270, // w21 w20 w17 w16 + 17855,6270,-31521,-17855, // w23 w22 w19 w18 + 17855,-31521,6270,-17855, // w29 w28 w25 w24 + 6270,26722,26722,-31521, // w31 w30 w27 w26 +// %3 for rows 2,6 - constants are multiplied by cos_2_16 + 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 + 21407,11585,-21407,-27969, // w07 w06 w03 w02 + 21407,-11585,21407,-27969, // w13 w12 w09 w08 + -21407,27969,21407,-11585, // w15 w14 w11 w10 + 29692,25172,25172,-5906, // w21 w20 w17 w16 + 16819,5906,-29692,-16819, // w23 w22 w19 w18 + 16819,-29692,5906,-16819, // w29 w28 w25 w24 + 5906,25172,25172,-29692, // w31 w30 w27 w26 +// %3 for rows 3,5 - constants are multiplied by cos_3_16 + 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 + 19266,10426,-19266,-25172, // w07 w06 w03 w02 + 19266,-10426,19266,-25172, // w13 w12 w09 w08 + -19266,25172,19266,-10426, // w15 w14 w11 w10 + 26722,22654,22654,-5315, // w21 w20 w17 w16 + 15137,5315,-26722,-15137, // w23 w22 w19 w18 + 15137,-26722,5315,-15137, // w29 w28 w25 w24 + 5315,22654,22654,-26722, // w31 w30 w27 w26 +}; + + + +#define DCT_8_INV_ROW_LOONGSON2(A1,A2,A3,A4)\ + "ldc1 $f0, " #A1 " \n\t"/* 0 ; x3 x2 x1 x0*/\ + "ldc1 $f2, 8+" #A1 " \n\t"/* 1 ; x7 x6 x5 x4*/\ + "mov.d $f4, $f0 \n\t"/* 2 ; x3 x2 x1 x0*/\ + "ldc1 $f6, " #A3 " \n\t"/* 3 ; w05 w04 w01 w00*/\ + "li $12, 0x88 \n\t"\ + "dmtc1 $12, $f16 \n\t"\ + "pshufh $f0, $f0, $f16 \n\t"/* x2 x0 x2 x0*/\ + "ldc1 $f8, 8+" #A3 " \n\t"/* 4 ; w07 w06 w03 w02*/\ + "mov.d $f10, $f2 
\n\t"/* 5 ; x7 x6 x5 x4*/\ + "pmaddhw $f6, $f6, $f0 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ + "ldc1 $f12, 32+" #A3 " \n\t"/* 6 ; w21 w20 w17 w16*/\ + "pshufh $f2, $f2, $f16 \n\t"/* x6 x4 x6 x4*/\ + "pmaddhw $f8, $f8, $f2 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ + "li $12, 0xdd \n\t"\ + "dmtc1 $12, $f16 \n\t"\ + "ldc1 $f14, 40+" #A3 " \n\t"/* 7 ; w23 w22 w19 w18*/\ + "pshufh $f4, $f4, $f16 \n\t"/* x3 x1 x3 x1*/\ + "pmaddhw $f12, $f12, $f4 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ + "ldc1 $f18, " #A4 " \n\t" \ + "ldc1 $f20, 16+" #A3 " \n\t" \ + "ldc1 $f22, 24+" #A3 " \n\t" \ + "ldc1 $f24, 48+" #A3 " \n\t" \ + "ldc1 $f26, 56+" #A3 " \n\t" \ + "pshufh $f10, $f10, $f16 \n\t"/* x7 x5 x7 x5*/\ + "pmaddhw $f14, $f14, $f10 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ + "paddw $f6, $f6, $f18 \n\t"/* +%4*/\ + "pmaddhw $f0, $f0, $f20 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ + "paddw $f6, $f6, $f8 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ + "pmaddhw $f2, $f2, $f22 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ + "mov.d $f8, $f6 \n\t"/* 4 ; a1 a0*/\ + "li $12, 11 \n\t"\ + "dmtc1 $12, $f16 \n\t"\ + "pmaddhw $f4, $f4, $f24 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ + "paddw $f12, $f12, $f14 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ + "pmaddhw $f10, $f10, $f26 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ + "paddw $f6, $f6, $f12 \n\t"/* a1+b1 a0+b0*/\ + "paddw $f0, $f0, $f18 \n\t"/* +%4*/\ + "psraw $f6, $f6, $f16 \n\t"/* y1=a1+b1 y0=a0+b0*/\ + "paddw $f0, $f0, $f2 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\ + "psubw $f8, $f8, $f12 \n\t"/* 6 ; a1-b1 a0-b0*/\ + "mov.d $f14, $f0 \n\t"/* 7 ; a3 a2*/\ + "paddw $f4, $f4, $f10 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ + "paddw $f0, $f0, $f4 \n\t"/* a3+b3 a2+b2*/\ + "psraw $f8, $f8, $f16 \n\t"/* y6=a1-b1 y7=a0-b0*/\ + "psubw $f14, $f14, $f4 \n\t"/* 2 ; a3-b3 a2-b2*/\ + "psraw $f0, $f0, $f16 \n\t"/* y3=a3+b3 y2=a2+b2*/\ + "psraw $f14, $f14, $f16 \n\t"/* y4=a3-b3 y5=a2-b2*/\ + "li $12, 0xb1 \n\t"\ + "dmtc1 $12, $f20 \n\t"\ + "packsswh $f6, $f6, $f0 \n\t"/* 0 ; y3 y2 y1 y0*/\ 
+ "packsswh $f14, $f14, $f8 \n\t"/* 4 ; y6 y7 y4 y5*/\ + "sdc1 $f6, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ + "pshufh $f14, $f14, $f20 \n\t"/* y7 y6 y5 y4*/\ + "sdc1 $f14, 8 +" #A2 " \n\t"/* 7 ; save y7 y6 y5 y4*/\ + + +#define DCT_8_INV_COL(A1,A2)\ + "ldc1 $f0, 2*8(%3) \n\t"/* */\ + "ldc1 $f6, 16*3+" #A1 " \n\t"/* x3 */\ + "mov.d $f2, $f0 \n\t"/* tg_3_16*/\ + "ldc1 $f10, 16*5+" #A1 " \n\t"/* x5 */\ + "pmulhh $f0, $f0, $f6 \n\t"/* x3*(tg_3_16-1)*/\ + "ldc1 $f8, (%3) \n\t"\ + "pmulhh $f2, $f2, $f10 \n\t"/* x5*(tg_3_16-1)*/\ + "ldc1 $f14, 16*7+" #A1 " \n\t"/* x7 */\ + "mov.d $f4, $f8 \n\t"/* tg_1_16*/\ + "ldc1 $f12, 16*1+" #A1 " \n\t"/* x1 */\ + "pmulhh $f8, $f8, $f14 \n\t"/* x7*tg_1_16*/\ + "paddsh $f0, $f0, $f6 \n\t"/* x3*tg_3_16*/\ + "pmulhh $f4, $f4, $f12 \n\t"/* x1*tg_1_16*/\ + "paddsh $f2, $f2, $f6 \n\t"/* x3+x5*(tg_3_16-1)*/\ + "psubsh $f0, $f0, $f10 \n\t"/* x3*tg_3_16-x5 = tm35*/\ + "ldc1 $f6, 3*8(%3) \n\t"\ + "paddsh $f2, $f2, $f10 \n\t"/* x3+x5*tg_3_16 = tp35*/\ + "paddsh $f8, $f8, $f12 \n\t"/* x1+tg_1_16*x7 = tp17*/\ + "psubsh $f4, $f4, $f14 \n\t"/* x1*tg_1_16-x7 = tm17*/\ + "mov.d $f10, $f8 \n\t"/* tp17*/\ + "mov.d $f12, $f4 \n\t"/* tm17*/\ + "paddsh $f10, $f10, $f2 \n\t"/* tp17+tp35 = b0*/\ + "psubsh $f12, $f12, $f0 \n\t"/* tm17-tm35 = b3*/\ + "psubsh $f8, $f8, $f2 \n\t"/* tp17-tp35 = t1*/\ + "paddsh $f4, $f4, $f0 \n\t"/* tm17+tm35 = t2*/\ + "ldc1 $f14, 1*8(%3) \n\t"\ + "mov.d $f2, $f8 \n\t"/* t1*/\ + "sdc1 $f10, 3*16+" #A2 " \n\t"/* save b0*/\ + "paddsh $f2, $f2, $f4 \n\t"/* t1+t2*/\ + "sdc1 $f12, 5*16+" #A2 " \n\t"/* save b3*/\ + "psubsh $f8, $f8, $f4 \n\t"/* t1-t2*/\ + "ldc1 $f10, 2*16+" #A1 " \n\t"\ + "mov.d $f0, $f14 \n\t"/* tg_2_16*/\ + "ldc1 $f12, 6*16+" #A1 " \n\t"\ + "pmulhh $f0, $f0, $f10 \n\t"/* x2*tg_2_16*/\ + "pmulhh $f14, $f14, $f12 \n\t"/* x6*tg_2_16*/\ + "pmulhh $f2, $f2, $f6 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ + "ldc1 $f4, 0*16+" #A1 " \n\t"\ + "pmulhh $f8, $f8, $f6 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ + "psubsh $f0, $f0, $f12 \n\t"/* 
t2*tg_2_16-x6 = tm26*/\ + "mov.d $f6, $f4 \n\t"/* x0*/\ + "ldc1 $f12, 4*16+" #A1 " \n\t"\ + "paddsh $f14, $f14, $f10 \n\t"/* x2+x6*tg_2_16 = tp26*/\ + "paddsh $f4, $f4, $f12 \n\t"/* x0+x4 = tp04*/\ + "psubsh $f6, $f6, $f12 \n\t"/* x0-x4 = tm04*/\ + "mov.d $f10, $f4 \n\t"/* tp04*/\ + "mov.d $f12, $f6 \n\t"/* tm04*/\ + "psubsh $f4, $f4, $f14 \n\t"/* tp04-tp26 = a3*/\ + "paddsh $f6, $f6, $f0 \n\t"/* tm04+tm26 = a1*/\ + "paddsh $f2, $f2, $f2 \n\t"/* b1*/\ + "paddsh $f8, $f8, $f8 \n\t"/* b2*/\ + "paddsh $f10, $f10, $f14 \n\t"/* tp04+tp26 = a0*/\ + "psubsh $f12, $f12, $f0 \n\t"/* tm04-tm26 = a2*/\ + "li $12, 6 \n\t"\ + "dmtc1 $12, $f18 \n\t"\ + "mov.d $f14, $f6 \n\t"/* a1*/\ + "mov.d $f0, $f12 \n\t"/* a2*/\ + "paddsh $f6, $f6, $f2 \n\t"/* a1+b1*/\ + "paddsh $f12, $f12, $f8 \n\t"/* a2+b2*/\ + "psrah $f6, $f6, $f18 \n\t"/* dst1*/\ + "psubsh $f14, $f14, $f2 \n\t"/* a1-b1*/\ + "psrah $f12, $f12, $f18 \n\t"/* dst2*/\ + "psubsh $f0, $f0, $f8 \n\t"/* a2-b2*/\ + "ldc1 $f2, 3*16+" #A2 " \n\t"/* load b0*/\ + "psrah $f14, $f14, $f18 \n\t"/* dst6*/\ + "mov.d $f8, $f10 \n\t"/* a0*/\ + "psrah $f0, $f0, $f18 \n\t"/* dst5*/\ + "sdc1 $f6, 1*16+" #A2 " \n\t"\ + "paddsh $f10, $f10, $f2 \n\t"/* a0+b0*/\ + "sdc1 $f12, 2*16+" #A2 " \n\t"\ + "psubsh $f8, $f8, $f2 \n\t"/* a0-b0*/\ + "ldc1 $f6, 5*16+" #A2 " \n\t"/* load b3*/\ + "psrah $f10, $f10, $f18 \n\t"/* dst0*/\ + "mov.d $f12, $f4 \n\t"/* a3*/\ + "psrah $f8, $f8, $f18 \n\t"/* dst7*/\ + "sdc1 $f0, 5*16+" #A2 " \n\t"\ + "paddsh $f4, $f4, $f6 \n\t"/* a3+b3*/\ + "sdc1 $f14, 6*16+" #A2 " \n\t"\ + "psubsh $f12, $f12, $f6 \n\t"/* a3-b3*/\ + "sdc1 $f10, 0*16+" #A2 " \n\t"\ + "psrah $f4, $f4, $f18 \n\t"/* dst3*/\ + "sdc1 $f8, 7*16+" #A2 " \n\t"\ + "psrah $f12, $f12, $f18 \n\t"/* dst4*/\ + "sdc1 $f4, 3*16+" #A2 " \n\t"\ + "sdc1 $f12, 4*16+" #A2 " \n\t" + + + +void ff_idct_xvid_loongson2(short *block){ + __asm__ volatile( + //# Process each row + DCT_8_INV_ROW_LOONGSON2(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) + DCT_8_INV_ROW_LOONGSON2(1*16(%0), 
1*16(%0), 64*1(%2), 8*1(%1)) + DCT_8_INV_ROW_LOONGSON2(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) + DCT_8_INV_ROW_LOONGSON2(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) + DCT_8_INV_ROW_LOONGSON2(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) + DCT_8_INV_ROW_LOONGSON2(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) + DCT_8_INV_ROW_LOONGSON2(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) + DCT_8_INV_ROW_LOONGSON2(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) + + //# Process the columns (4 at a time) + DCT_8_INV_COL(0(%0), 0(%0)) + DCT_8_INV_COL(8(%0), 8(%0)) + : + : "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16) + :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f18","$f16","$20","$22","$24","$26"); +} + diff --git a/libavcodec/loongson2/motion_est_loongson2.c b/libavcodec/loongson2/motion_est_loongson2.c new file mode 100644 index 0000000..bb67290 --- /dev/null +++ b/libavcodec/loongson2/motion_est_loongson2.c @@ -0,0 +1,365 @@ +/* + * Loongson2E MMI optimized motion estimation + * Copyright (c) 2007 comcat . + * + * based on Michael Niedermayer + * + */ + +#include "dsputil_loongson2.h" +#include "../avcodec.h" + +static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ + 0x0000000000000000ULL, + 0x0001000100010001ULL, + 0x0002000200020002ULL, +}; + +static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL; + +static inline void sad8_1_loongson2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + long len= -(stride*h); + __asm__ volatile( + +// ".set mips3 \n\t" + ".align 4 \n\t" + + "move $8, %0 \n\t" + "move $21, %1 \n\t" + "move $22, %2 \n\t" + "move $23, %3 \n\t" + + "1: \n\t" + + "add $9, $8, $21 \n\t" + "add $10, $8, $22 \n\t" + + "uld $11, ($9) \n\t" + "dmtc1 $11, $f0 \n\t" + + "uld $12, ($9) \n\t" + "dmtc1 $12, $f4 \n\t" + + "pasubub $f10, $f0, $f4 \n\t" + "biadd $f0, $f10 \n\t" + + "add $8, $8, $23 \n\t" + + "add $9, $8, $21 \n\t" + "add $10, $8, $22 \n\t" + + "uld $11, ($9) \n\t" + "dmtc1 $11, $f2 \n\t" + + "uld $12, ($10) \n\t" + "dmtc1 
$12, $f6 \n\t" + + "pasubub $f16, $f2, $f6 \n\t" + "biadd $f6, $f16 \n\t" + + "paddh $f0, $f0, $f6 \n\t" + + "paddh $f12, $f12, $f0 \n\t" + + "bltz $8, 1b \n\t" + "add $8, $8, $23 \n\t" + + : "+r" (len) + : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride) + : "$8", "$9", "$10", "$21", "$22", "$23", "$f0", "$f2", "$f4", "$f6", "$f10", "$f16" + ); +} + +static inline void sad8_2_loongson2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) +{ + long len= -(stride*h); + __asm__ volatile( + +// ".set mips3 \n\t" + ".align 4 \n\t" + + "move $8, %0 \n\t" + + "1: \n\t" + "add $9, $8, %1 \n\t" + "add $10, $8, %2 \n\t" + "add $11, $8, %3 \n\t" + + "uld $12, ($9) \n\t" + "dmtc1 $12, $f0 \n\t" + "uld $13, ($10) \n\t" + "dmtc1 $13, $f4 \n\t" + + "pavgb $f0, $f0, $f4 \n\t" + + "uld $12, ($11) \n\t" + "dmtc1 $12, $f4 \n\t" + + "pasubub $f10, $f0, $f4 \n\t" + "biadd $f0, $f10 \n\t" + + "add $8, $8, %4 \n\t" + + "add $9, $8, %1 \n\t" + "add $10, $8, %2 \n\t" + "add $11, $8, %3 \n\t" + + "uld $12, ($9) \n\t" + "dmtc1 $12, $f2 \n\t" + "uld $13, ($10) \n\t" + "dmtc1 $13, $f6 \n\t" + + "pavgb $f6, $f6, $f2 \n\t" + + "uld $12, ($11) \n\t" + "dmtc1 $12, $f2 \n\t" + + "pasubub $f16, $f6, $f2 \n\t" + "biadd $f6, $f16 \n\t" + + "paddh $f0, $f0, $f6 \n\t" + "paddh $f12, $f12, $f0 \n\t" + + "bltz $8, 1b \n\t" + "add $8, $8, %4 \n\t" + : "+r" (len) + : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride) + : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f10", "$f16" + ); +} + +static inline void sad8_4_loongson2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + long len= -(stride*h); + __asm__ volatile( + + +// ".set mips3 \n\t" + ".align 4 \n\t" + + "ldc1 $f10, "MANGLE(bone)" \n\t" + + "move $8, %0 \n\t" + + "1: \n\t" + "add $9, $8, %1 \n\t" + "add $10, $8, %2 \n\t" + "add $11, $8, %3 \n\t" + + "uld $12, ($9) \n\t" + "dmtc1 $12, $f0 \n\t" + + "uld $13, ($10) \n\t" + "dmtc1 $13, $f4 \n\t" + + "uld $12, 1($9) \n\t" + "dmtc1 
$12, $f2 \n\t" + + "uld $13, 1($10) \n\t" + "dmtc1 $13, $f6 \n\t" + + "pavgb $f0, $f0, $f4 \n\t" + "pavgb $f6, $f6, $f2 \n\t" + + "psubusb $f6, $f6, $f10 \n\t" + "pavgb $f0, $f0, $f6 \n\t" + + "uld $13, 1($11) \n\t" + "dmtc1 $13, $f4 \n\t" + + "pasubub $f16, $f0, $f4 \n\t" + "biadd $f0, $f16 \n\t" + + "add $8, $8, %4 \n\t" + + "add $9, $8, %1 \n\t" + "add $10, $8, %2 \n\t" + "add $11, $8, %3 \n\t" + + "uld $12, ($9) \n\t" + "dmtc1 $12, $f2 \n\t" + "uld $13, ($10) \n\t" + "dmtc1 $12, $f6 \n\t" + "uld $12, 1($9) \n\t" + "dmtc1 $12, $f4 \n\t" + "uld $13, 1($10) \n\t" + "dmtc1 $12, $f8 \n\t" + + "pavgb $f2, $f2, $f6 \n\t" + "pavgb $f4, $f4, $f8 \n\t" + + "psubusb $f4, $f4, $f10 \n\t" + "pavgb $f4, $f4, $f2 \n\t" + + "uld $13, ($11) \n\t" + "dmtc1 $13, $f2 \n\t" + + "pasubub $f18, $f4, $f2 \n\t" + "biadd $f4, $f18 \n\t" + + "paddh $f0, $f0, $f4 \n\t" + "paddh $f12, $f12, $f0 \n\t" + + "bltz $8, 1b \n\t" + "add $8, $8, %4 \n\t" + : "+r" (len) + : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride) + : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f16", "$f18" + ); +} + +static inline int sum_loongson2(void) +{ + int ret; + __asm__ volatile( +// ".set mips3 \n\t" + + "dmfc1 %0, $f12 \n\t" + : "=r" (ret) + ); + return ret; +} + + +static int sad8_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + assert(h==8); + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + : + ); + + sad8_1_loongson2(blk1, blk2, stride, 8); + + return sum_loongson2(); +} + +static int sad8_x2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + assert(h==8); + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + + "ldc1 $f10, %0 \n\t" + :: "m"(round_tab[1]) + ); + + sad8_2_loongson2(blk1, blk1+1, blk2, stride, 8); + + return sum_loongson2(); +} + +static int sad8_y2_loongson2(void *v, uint8_t 
*blk2, uint8_t *blk1, int stride, int h) +{ + assert(h==8); + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + + "ldc1 $f10, %0 \n\t" + :: "m"(round_tab[1]) + ); + + sad8_2_loongson2(blk1, blk1+stride, blk2, stride, 8); + + return sum_loongson2(); +} + +static int sad8_xy2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + assert(h==8); + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + "ldc1 $f10, %0 \n\t" + :: "m"(round_tab[2]) + ); + + sad8_4_loongson2(blk1, blk2, stride, 8); + + return sum_loongson2(); +} + +static int sad16_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t":); + + sad8_1_loongson2(blk1 , blk2 , stride, h); + sad8_1_loongson2(blk1+8, blk2+8, stride, h); + + return sum_loongson2(); +} + +static int sad16_x2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + "ldc1 $f10, %0 \n\t" + :: "m"(round_tab[1]) + ); + + sad8_2_loongson2(blk1 , blk1+1, blk2 , stride, h); + sad8_2_loongson2(blk1+8, blk1+9, blk2+8, stride, h); + + return sum_loongson2(); +} + +static int sad16_y2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + "ldc1 $f10, %0 \n\t" + :: "m"(round_tab[1]) + ); + + sad8_2_loongson2(blk1 , blk1+stride, blk2 , stride, h); + sad8_2_loongson2(blk1+8, blk1+stride+8,blk2+8, stride, h); + + return sum_loongson2(); +} + +static int sad16_xy2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + __asm__ volatile( +// ".set mips3 \n\t" + "xor $f14, $f14, $f14 \n\t" + "xor $f12, $f12, $f12 \n\t" + "ldc1 $f10, %0 \n\t" + :: "m"(round_tab[2]) + ); 
+ + sad8_4_loongson2(blk1 , blk2 , stride, h); + sad8_4_loongson2(blk1+8, blk2+8, stride, h); + + return sum_loongson2(); +} + + +void dsputil_init_pix_loongson2(DSPContext* c, AVCodecContext *avctx) +{ + c->pix_abs[0][0] = sad16_loongson2; + c->pix_abs[0][1] = sad16_x2_loongson2; + c->pix_abs[0][2] = sad16_y2_loongson2; + c->pix_abs[0][3] = sad16_xy2_loongson2; + c->pix_abs[1][0] = sad8_loongson2; + c->pix_abs[1][1] = sad8_x2_loongson2; + c->pix_abs[1][2] = sad8_y2_loongson2; + c->pix_abs[1][3] = sad8_xy2_loongson2; + + c->sad[0]= sad16_loongson2; + c->sad[1]= sad8_loongson2; +} diff --git a/libavcodec/loongson2/mpegvideo_loongson2.c b/libavcodec/loongson2/mpegvideo_loongson2.c new file mode 100644 index 0000000..18d070a --- /dev/null +++ b/libavcodec/loongson2/mpegvideo_loongson2.c @@ -0,0 +1,385 @@ +/* + * The simplest mpeg encoder (well, it was the simplest!) + * Copyright (c) 2007-2010 comcat . + * + * Optimized for Loongson2 CPUs by comcat + * + * Based on i386 + */ + +#include "dsputil_loongson2.h" +#include "../mpegvideo.h" +#include "../avcodec.h" + +extern uint8_t zigzag_direct_noperm[64]; +extern uint16_t inv_zigzag_direct16[64]; + +static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; +static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; + + +static void dct_unquantize_h263_intra_loongson2(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + long level, qmul, qadd, nCoeffs; + + qmul = qscale << 1; + + assert(s->block_last_index[n]>=0 || s->h263_aic); + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level= block[0]; + } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + + __asm__ volatile( +// ".set mips3 \n\t" + + "xor $f12, $f12, $f12 \n\t" + "lwc1 $f12, %1 \n\t" + + "xor $f10, $f10, 
$f10 \n\t" + + "packsswh $f12, $f12, $f12 \n\t" + + "lwc1 $f10, %2 \n\t" + + "packsswh $f10, $f10, $f10 \n\t" + + "packsswh $f12, $f12, $f12 \n\t" + + "xor $f14, $f14, $f14 \n\t" + + "packsswh $f10, $f10, $f10 \n\t" + + "xor $f8, $f8, $f8 \n\t" + + "psubh $f14, $f14, $f10 \n\t" + + + "1: \n\t" + "add $12, %0, %3 \n\t" + + "ldc1 $f0, ($12) \n\t" + + "ldc1 $f2, 8($12) \n\t" + + "mov.d $f4, $f0 \n\t" + "mov.d $f6, $f2 \n\t" + + "pmullh $f0, $f0, $f12 \n\t" + "pmullh $f2, $f2, $f12 \n\t" + + "pcmpgth $f4, $f4, $f8 \n\t" + "pcmpgth $f6, $f6, $f8 \n\t" + + "xor $f0, $f0, $f4 \n\t" + "xor $f2, $f2, $f6 \n\t" + + + "paddh $f0, $f0, $f14 \n\t" + + "paddh $f2, $f2, $f14 \n\t" + + + "xor $f4, $f4, $f0 \n\t" + + "xor $f6, $f6, $f2 \n\t" + + + "pcmpeqh $f0, $f0, $f14 \n\t" + + "pcmpeqh $f2, $f2, $f14 \n\t" + + + "pandn $f0, $f0, $f4 \n\t" + + "pandn $f2, $f2, $f6 \n\t" + + + "sdc1 $f0, ($12) \n\t" + + "sdc1 $f2, 8($12) \n\t" + + + "addiu %3, %3, 16 \n\t" + + "blez %3, 1b \n\t" + "nop \n\t" + ::"r" (block+nCoeffs), "m"(qmul), "m" (qadd), "r" (2*(-nCoeffs)) + : "memory" + ); + block[0]= level; +} + + +static void dct_unquantize_h263_inter_loongson2(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + long qmul, qadd, nCoeffs; + + qmul = qscale << 1; + qadd = (qscale - 1) | 1; + + assert(s->block_last_index[n]>=0 || s->h263_aic); + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; + + __asm__ volatile( +// ".set mips3 \n\t" + + "xor $f12, $f12, $f12 \n\t" + "lwc1 $f12, %1 \n\t" + + "xor $f10, $f10, $f10 \n\t" + + "packsswh $f12, $f12, $f12 \n\t" + + "lwc1 $f10, %2 \n\t" + + "packsswh $f10, $f10, $f10 \n\t" + + "xor $f14, $f14, $f14 \n\t" + + "packsswh $f12, $f12, $f12 \n\t" + + "packsswh $f10, $f10, $f10 \n\t" + + "xor $f8, $f8, $f8 \n\t" + + "psubh $f14, $f14, $f10 \n\t" + + + "1: \n\t" + "add $12, %0, %3 \n\t" + + "ldc1 $f0, ($12) \n\t" + + "ldc1 $f2, 8($12) \n\t" + + "mov.d $f4, $f0 \n\t" + "mov.d $f6, $f2 \n\t" + + "pmullh $f0, $f0, $f12 \n\t" + + 
"pmullh $f2, $f2, $f12 \n\t" + + "pcmpgth $f4, $f4, $f8 \n\t" + + "pcmpgth $f6, $f6, $f8 \n\t" + + "xor $f0, $f0, $f4 \n\t" + + "xor $f2, $f2, $f6 \n\t" + + "paddh $f0, $f0, $f14 \n\t" + + "paddh $f2, $f2, $f14 \n\t" + + "xor $f4, $f4, $f0 \n\t" + + "xor $f6, $f6, $f2 \n\t" + + "pcmpeqh $f0, $f0, $f14 \n\t" + + "pcmpeqh $f2, $f2, $f14 \n\t" + + "pandn $f0, $f0, $f4 \n\t" + + "pandn $f2, $f2, $f6 \n\t" + + "sdc1 $f0, ($12) \n\t" + + "sdc1 $f2, 8($12) \n\t" + + + "addiu %3, %3, 16 \n\t" + + "blez %3, 1b \n\t" + "nop \n\t" + ::"r" (block+nCoeffs), "m"(qmul), "m" (qadd), "r" (2*(-nCoeffs)) + : "memory" + ); +} + + +/* draw the edges of width 'w' of an image of size width, height + this mmx version can only handle w==8 || w==16 */ + +static void draw_edges_loongson2(uint8_t *buf, int wrap, int width, int height, int w) +{ + uint8_t *ptr, *last_line; + int i; + + last_line = buf + (height - 1) * wrap; + + ptr = buf; + if(w==8) + { + __asm__ volatile( +// ".set mips3 \n\t" + + "move $9, %0 \n\t" + + "1: \n\t" + + "xor $f0, $f0, $f0 \n\t" + "lwc1 $f0, ($9) \n\t" + + "punpcklbh $f0, $f0, $f0 \n\t" + + "add $12, $9, %2 \n\t" + + "punpcklhw $f0, $f0, $f0 \n\t" + + "punpcklwd $f0, $f0, $f0 \n\t" + + "ldc1 $f2, -8($12) \n\t" + + "sdc1 $f0, -8($9) \n\t" + + "punpckhbh $f2, $f2, $f2 \n\t" + + "add $9, $9, %1 \n\t" + + "punpckhhw $f2, $f2, $f2 \n\t" + + "sub $13, $9, %3 \n\t" + + "punpckhwd $f2, $f2, $f2 \n\t" + + "bltz $13, 1b \n\t" + + "sdc1 $f2, ($12) \n\t" + + : "+r" (ptr) + : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) + : "$9", "$13", "$12", "$f2", "$f0" + ); + } + else + { + __asm__ volatile( +// ".set mips3 \n\t" + + "move $8, %0 \n\t" + + "1: \n\t" + + "xor $f0, $f0, $f0 \n\t" + "lwc1 $f0, ($8) \n\t" + + "punpcklbh $f0, $f0, $f0 \n\t" + "punpcklhw $f0, $f0, $f0 \n\t" + "punpcklwd $f0, $f0, $f0 \n\t" + + "sdc1 $f0, -8($8) \n\t" + "sdc1 $f0, -16($8) \n\t" + + "add $15, $8, %2 \n\t" + "ldc1 $f2, -8($15) \n\t" + + "punpckhbh $f2, $f2, $f2 \n\t" + "punpckhhw 
$f2, $f2, $f2                \n\t"
+                "punpckhwd        $f2, $f2, $f2                \n\t"
+
+                "sdc1        $f2, ($15)                        \n\t"
+                "sdc1        $f2, 8($15)                        \n\t"
+
+                "add        $8, $8, %1                        \n\t"
+
+                "sub        $16, $8, %3                        \n\t"
+                "bltz        $16, 1b                                \n\t"
+                "nop                                        \n\t"
+                : "+r" (ptr)
+                : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
+                : "$8", "$15", "$16", "$f0", "$f2"
+        );
+        }
+
+        /* NOTE(review): the body of this loop and the MPV_common_init_loongson2
+           signature below were lost in transit (text between '<' and '>' was
+           stripped); reconstructed from the equivalent x86 draw_edges_mmx
+           top/bottom edge copy — verify against the original patch. */
+        for(i=0;i<w;i++) {
+                /* top and bottom (including corners) */
+                memcpy(buf - (i+1) * wrap, buf - w, width + w + w);
+                memcpy(last_line + (i+1) * wrap, last_line - w, width + w + w);
+        }
+}
+
+
+void MPV_common_init_loongson2(MpegEncContext *s)
+{
+        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_loongson2;
+        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_loongson2;
+
+//        draw_edges = draw_edges_loongson2;
+
+}
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 3f4da68..73e4d56 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -1,3 +1,9 @@
 OBJS-$(HAVE_MMI) += ps2/dsputil_mmi.o \
                     ps2/idct_mmi.o \
                     ps2/mpegvideo_mmi.o \
+
+OBJS-$(HAVE_LOONGSON2MMI) += loongson2/idct_loongson2.o \
+                             loongson2/dsputil_loongson2.o \
+                             loongson2/idct_loongson2_xvid.o \
+                             loongson2/mpegvideo_loongson2.o \
+                             loongson2/motion_est_loongson2.o
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index b47ff9a..af92552 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -176,6 +176,9 @@ av_cold int ff_dct_common_init(MpegEncContext *s)
 #elif ARCH_BFIN
     MPV_common_init_bfin(s);
 #endif
+#ifdef HAVE_LOONGSON2MMI
+    MPV_common_init_loongson2(s);
+#endif
 
     /* load & permutate scantables
        note: only wmv uses different ones
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 5302be9..8d09906 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -689,6 +689,7 @@ int MPV_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size,
 void MPV_common_init_mmx(MpegEncContext *s);
 void MPV_common_init_axp(MpegEncContext *s);
 void MPV_common_init_mlib(MpegEncContext *s);
+void MPV_common_init_loongson2(MpegEncContext *s);
 void MPV_common_init_mmi(MpegEncContext *s);
 void MPV_common_init_arm(MpegEncContext *s);
 void MPV_common_init_altivec(MpegEncContext *s);
diff --git a/libavcodec/options.c b/libavcodec/options.c
index 7ca1062..c05b3f4 100644 --- a/libavcodec/options.c +++ b/libavcodec/options.c @@ -205,6 +205,8 @@ static const AVOption options[]={ {"simple", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLE, INT_MIN, INT_MAX, V|E|D, "idct"}, {"simplemmx", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_SIMPLEMMX, INT_MIN, INT_MAX, V|E|D, "idct"}, {"libmpeg2mmx", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_LIBMPEG2MMX, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"libmpeg2loongson2", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_LIBMPEG2LOONGSON2, INT_MIN, INT_MAX, V|E|D, "idct"}, +{"xvidloongson2", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_XVIDLOONGSON2, INT_MIN, INT_MAX, V|E|D, "idct"}, {"ps2", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_PS2, INT_MIN, INT_MAX, V|E|D, "idct"}, {"mlib", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_MLIB, INT_MIN, INT_MAX, V|E|D, "idct"}, {"arm", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_ARM, INT_MIN, INT_MAX, V|E|D, "idct"},