00001 #include <stdio.h>
00002 #include <stdlib.h>
00003 #include <time.h>
00004
00005
00006
/*
 * 64-bit constants laid out for use as MMX memory operands.
 *
 * Each table must occupy exactly eight bytes, so the elements are
 * `unsigned int` (32 bits on every x86 ABI).  `unsigned long` is 64 bits
 * wide on LP64 targets, which would double the size of the tables and leave
 * only the first element inside the eight bytes an MMX instruction reads.
 *
 * `used` keeps the objects in the image even when the compiler cannot see a
 * C-level reference to them (for example when they are named only from
 * assembly); `aligned(8)` keeps a 64-bit load from straddling a boundary.
 */

/* 0x0001 in each of the four 16-bit lanes. */
static const unsigned int MMX_AVGDIFF_1[2] __attribute__((used, aligned(8))) = {0x00010001, 0x00010001};

/* All ones in the low 32 bits, zero in the high 32 bits. */
static const unsigned int MMX_ACCUM_AND[2] __attribute__((used, aligned(8))) = {0xffffffff, 0x00000000};
00009
/*
 * Begin a new accumulation: clear MMX register %mm7, which holds four
 * 16-bit running totals.
 *
 * The register is left in MMX state (no emms is issued here), so no x87
 * floating-point code may run until the MMX state has been released.
 *
 * Declared `static inline` so the definition links under both the GNU89
 * and the C99 rules for `inline`.
 */
static inline void mmx_start_block(void)
{
    __asm__ __volatile__(
        "pxor %%mm7, %%mm7"     /* zero the totals */
        :
        :
        : "mm7");
}
00017
/*
 * Accumulate |c - avg(a, b)| for eight 8-bit pixels into MMX register %mm7.
 *
 * p1, p2 - source rows a and b; eight bytes are read from each
 * p3     - reference row c; eight bytes are read
 *
 * avg(a, b) is the rounded mean (a + b + 1) >> 1.  The pixels are widened
 * to 16 bits, so the eight absolute differences are added into the four
 * word lanes of %mm7: lane i receives the values for pixels i and i + 4.
 * The additions into %mm7 are wrapping 16-bit adds (paddw).
 *
 * %mm7 must have been zeroed before the first call of a block, and the
 * MMX state stays live on return (no emms is issued here).
 *
 * The pointers are passed in compiler-chosen registers so the addressing
 * is correct on both 32-bit and 64-bit x86, and the "memory" clobber makes
 * sure the pixel data has really been written before the loads execute.
 */
static inline void mmx_avgdiff(unsigned char *p1, unsigned char *p2, unsigned char *p3)
{
    /* 1 in every 16-bit lane: the rounding term of the average. */
    static const unsigned long long word_ones = 0x0001000100010001ULL;

    __asm__ __volatile__(
        "movq      (%0), %%mm0\n\t"     /* load 8 pixels of a */
        "pxor      %%mm4, %%mm4\n\t"    /* zero for widening a */
        "movq      %%mm0, %%mm2\n\t"    /* copy of a */
        "movq      (%1), %%mm1\n\t"     /* load 8 pixels of b */
        "pxor      %%mm3, %%mm3\n\t"    /* zero for widening b */
        "punpcklbw %%mm4, %%mm2\n\t"    /* mm2 = a[0..3] as words */
        "movq      %%mm1, %%mm5\n\t"    /* copy of b */
        "punpckhbw %%mm4, %%mm0\n\t"    /* mm0 = a[4..7] as words */
        "punpcklbw %%mm3, %%mm5\n\t"    /* mm5 = b[0..3] as words */
        "paddw     %%mm2, %%mm5\n\t"    /* mm5 = a + b, pixels 0..3 */
        "punpckhbw %%mm3, %%mm1\n\t"    /* mm1 = b[4..7] as words */
        "paddw     %%mm0, %%mm1\n\t"    /* mm1 = a + b, pixels 4..7 */
        "movq      (%2), %%mm2\n\t"     /* load 8 pixels of c */
        "paddw     %3, %%mm5\n\t"       /* + 1 for rounding, pixels 0..3 */
        "pxor      %%mm4, %%mm4\n\t"    /* zero for widening c */
        "movq      %%mm2, %%mm3\n\t"    /* copy of c */
        "paddw     %3, %%mm1\n\t"       /* + 1 for rounding, pixels 4..7 */
        "punpcklbw %%mm4, %%mm3\n\t"    /* mm3 = c[0..3] as words */
        "punpckhbw %%mm4, %%mm2\n\t"    /* mm2 = c[4..7] as words */
        "movq      %%mm3, %%mm0\n\t"    /* keep c[0..3] for the reverse subtract */
        "psraw     $1, %%mm5\n\t"       /* mm5 = avg, pixels 0..3 */
        "movq      %%mm2, %%mm4\n\t"    /* keep c[4..7] for the reverse subtract */
        "psraw     $1, %%mm1\n\t"       /* mm1 = avg, pixels 4..7 */
        "psubusw   %%mm5, %%mm3\n\t"    /* saturating c - avg, pixels 0..3 */
        "psubusw   %%mm1, %%mm2\n\t"    /* saturating c - avg, pixels 4..7 */
        "psubusw   %%mm0, %%mm5\n\t"    /* saturating avg - c, pixels 0..3 */
        "por       %%mm5, %%mm3\n\t"    /* one side is 0, so OR = |c - avg| */
        "psubusw   %%mm4, %%mm1\n\t"    /* saturating avg - c, pixels 4..7 */
        "por       %%mm1, %%mm2\n\t"    /* |c - avg|, pixels 4..7 */
        "paddw     %%mm3, %%mm7\n\t"    /* accumulate pixels 0..3 */
        "paddw     %%mm2, %%mm7"        /* accumulate pixels 4..7 */
        :
        : "r"(p1), "r"(p2), "r"(p3), "m"(word_ones)
        : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm7", "memory");
}
00057
/*
 * Finish a block: add the four 16-bit totals held in MMX register %mm7
 * into a single value, release the MMX state with emms, and return the sum.
 *
 * The lanes are widened to 32 bits before they are added, and the adds are
 * 32-bit (paddd), so a carry out of 16 bits is kept rather than lost.
 *
 * The sum is moved straight from %mm7 into the return value through a
 * register output operand, so the compiler knows where the result comes
 * from and cannot assume a stale value.
 *
 * %mm5, %mm6 and %mm7 are overwritten; %mm7 no longer holds the totals
 * afterwards.
 */
unsigned int mmx_accum_avgdiff(void)
{
    unsigned int total;

    __asm__ __volatile__(
        "pxor      %%mm5, %%mm5\n\t"    /* zero source for widening */
        "movq      %%mm7, %%mm6\n\t"    /* copy of the four word totals */
        "punpcklwd %%mm5, %%mm6\n\t"    /* mm6 = { w0, w1 } as dwords */
        "punpckhwd %%mm5, %%mm7\n\t"    /* mm7 = { w2, w3 } as dwords */
        "paddd     %%mm6, %%mm7\n\t"    /* mm7 = { w0 + w2, w1 + w3 } */
        "movq      %%mm7, %%mm6\n\t"    /* copy for the final add */
        "psrlq     $32, %%mm6\n\t"      /* bring the upper dword down */
        "paddd     %%mm6, %%mm7\n\t"    /* low dword = w0 + w1 + w2 + w3 */
        "movd      %%mm7, %0\n\t"       /* only the low dword is the result */
        "emms"                          /* hand the registers back to the x87 FPU */
        : "=r"(total)
        :
        : "mm5", "mm6", "mm7");

    return total;
}
00079
00080
/*
 * Small MMX smoke test.
 *
 * Loads a local 64-bit value (255) into two MMX registers, adds them as
 * packed 32-bit lanes, writes the 8-byte sum to `result`, writes the
 * untouched first register back to the local and returns the local.
 *
 * result - destination buffer; eight bytes are written
 *
 * Returns the local value, which is 255 after the round trip.
 *
 * emms is issued before returning so that later x87 floating-point code
 * finds the FPU usable.  The "memory" clobber tells the compiler that the
 * asm reads and writes through both pointers.
 */
unsigned int mmx_test(unsigned char *result)
{
    unsigned long long r = 255;

    __asm__ __volatile__(
        "movq  (%1), %%mm0\n\t"         /* mm0 = r */
        "movq  (%1), %%mm1\n\t"         /* mm1 = r */
        "paddd %%mm0, %%mm1\n\t"        /* mm1 = r + r per 32-bit lane */
        "movq  %%mm1, (%0)\n\t"         /* store the sum to result[0..7] */
        "movq  %%mm0, (%1)\n\t"         /* write the original value back to r */
        "emms"
        :
        : "r"(result), "r"(&r)
        : "mm0", "mm1", "memory");

    return (unsigned int)r;
}
00096
/* Print `count` pixel values on one line, separated by single spaces. */
static void print_pixels(const unsigned char *px, int count)
{
    int i;

    for (i = 0; i < count; i++)
        printf("%d%c", px[i], (i + 1 < count) ? ' ' : '\n');
}

/*
 * Demo driver: runs one 8-pixel average/difference block over fixed sample
 * data and prints the inputs before and after the call, followed by the
 * accumulated total.
 *
 * Rows a and b are two overlapping windows of pixels1 (offsets 0 and 1);
 * row c is pixels3.  For this data the expected total is 47.
 */
int main(int argc, char *argv[])
{
    unsigned char pixels1[9] = { 13, 13, 12, 11, 11, 10, 9, 9, 10 };
    unsigned char pixels3[8] = { 15, 10, 7, 8, 14, 19, 21, 20 };
    unsigned char *p1 = &pixels1[0];
    unsigned char *p2 = &pixels1[1];
    unsigned char *p3 = &pixels3[0];
    unsigned int result;

    (void)argc;
    (void)argv;

    print_pixels(p1, 9);
    print_pixels(p3, 8);
    printf("-----------------------\n");

    /* No library calls between these three: the total lives in %mm7. */
    mmx_start_block();
    mmx_avgdiff(p1, p2, p3);
    result = mmx_accum_avgdiff();

    print_pixels(p1, 9);
    print_pixels(p3, 8);
    printf("%u\n", result);     /* result is unsigned, so %u rather than %d */

    return 0;
}