00001 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00002 {
00003 int stride = line_size;
00004 __asm__ __volatile__ (
00005 "and r12, %[pixels], #7 \n\t"
00006 "bic %[pixels], %[pixels], #7 \n\t"
00007 "tmcr wcgr1, r12 \n\t"
00008 "add r4, %[pixels], %[line_size] \n\t"
00009 "add r5, %[block], %[line_size] \n\t"
00010 "mov %[line_size], %[line_size], lsl #1 \n\t"
00011 "1: \n\t"
00012 "wldrd wr0, [%[pixels]] \n\t"
00013 "subs %[h], %[h], #2 \n\t"
00014 "wldrd wr1, [%[pixels], #8] \n\t"
00015 "add %[pixels], %[pixels], %[line_size] \n\t"
00016 "wldrd wr3, [r4] \n\t"
00017 "pld [%[pixels]] \n\t"
00018 "pld [%[pixels], #32] \n\t"
00019 "wldrd wr4, [r4, #8] \n\t"
00020 "add r4, r4, %[line_size] \n\t"
00021 "walignr1 wr8, wr0, wr1 \n\t"
00022 "pld [r4] \n\t"
00023 "pld [r4, #32] \n\t"
00024 "walignr1 wr10, wr3, wr4 \n\t"
00025 "wstrd wr8, [%[block]] \n\t"
00026 "add %[block], %[block], %[line_size] \n\t"
00027 "wstrd wr10, [r5] \n\t"
00028 "add r5, r5, %[line_size] \n\t"
00029 "bne 1b \n\t"
00030 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00031 :
00032 : "memory", "r4", "r5", "r12");
00033 }
00034
00035 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00036 {
00037 int stride = line_size;
00038 __asm__ __volatile__ (
00039 "and r12, %[pixels], #7 \n\t"
00040 "bic %[pixels], %[pixels], #7 \n\t"
00041 "tmcr wcgr1, r12 \n\t"
00042 "add r4, %[pixels], %[line_size] \n\t"
00043 "add r5, %[block], %[line_size] \n\t"
00044 "mov %[line_size], %[line_size], lsl #1 \n\t"
00045 "1: \n\t"
00046 "wldrd wr0, [%[pixels]] \n\t"
00047 "subs %[h], %[h], #2 \n\t"
00048 "wldrd wr1, [%[pixels], #8] \n\t"
00049 "add %[pixels], %[pixels], %[line_size] \n\t"
00050 "wldrd wr3, [r4] \n\t"
00051 "pld [%[pixels]] \n\t"
00052 "pld [%[pixels], #32] \n\t"
00053 "wldrd wr4, [r4, #8] \n\t"
00054 "add r4, r4, %[line_size] \n\t"
00055 "walignr1 wr8, wr0, wr1 \n\t"
00056 "wldrd wr0, [%[block]] \n\t"
00057 "wldrd wr2, [r5] \n\t"
00058 "pld [r4] \n\t"
00059 "pld [r4, #32] \n\t"
00060 "walignr1 wr10, wr3, wr4 \n\t"
00061 WAVG2B" wr8, wr8, wr0 \n\t"
00062 WAVG2B" wr10, wr10, wr2 \n\t"
00063 "wstrd wr8, [%[block]] \n\t"
00064 "add %[block], %[block], %[line_size] \n\t"
00065 "wstrd wr10, [r5] \n\t"
00066 "pld [%[block]] \n\t"
00067 "pld [%[block], #32] \n\t"
00068 "add r5, r5, %[line_size] \n\t"
00069 "pld [r5] \n\t"
00070 "pld [r5, #32] \n\t"
00071 "bne 1b \n\t"
00072 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00073 :
00074 : "memory", "r4", "r5", "r12");
00075 }
00076
00077 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00078 {
00079 int stride = line_size;
00080 __asm__ __volatile__ (
00081 "and r12, %[pixels], #7 \n\t"
00082 "bic %[pixels], %[pixels], #7 \n\t"
00083 "tmcr wcgr1, r12 \n\t"
00084 "add r4, %[pixels], %[line_size] \n\t"
00085 "add r5, %[block], %[line_size] \n\t"
00086 "mov %[line_size], %[line_size], lsl #1 \n\t"
00087 "1: \n\t"
00088 "wldrd wr0, [%[pixels]] \n\t"
00089 "wldrd wr1, [%[pixels], #8] \n\t"
00090 "subs %[h], %[h], #2 \n\t"
00091 "wldrd wr2, [%[pixels], #16] \n\t"
00092 "add %[pixels], %[pixels], %[line_size] \n\t"
00093 "wldrd wr3, [r4] \n\t"
00094 "pld [%[pixels]] \n\t"
00095 "pld [%[pixels], #32] \n\t"
00096 "walignr1 wr8, wr0, wr1 \n\t"
00097 "wldrd wr4, [r4, #8] \n\t"
00098 "walignr1 wr9, wr1, wr2 \n\t"
00099 "wldrd wr5, [r4, #16] \n\t"
00100 "add r4, r4, %[line_size] \n\t"
00101 "pld [r4] \n\t"
00102 "pld [r4, #32] \n\t"
00103 "walignr1 wr10, wr3, wr4 \n\t"
00104 "wstrd wr8, [%[block]] \n\t"
00105 "walignr1 wr11, wr4, wr5 \n\t"
00106 "wstrd wr9, [%[block], #8] \n\t"
00107 "add %[block], %[block], %[line_size] \n\t"
00108 "wstrd wr10, [r5] \n\t"
00109 "wstrd wr11, [r5, #8] \n\t"
00110 "add r5, r5, %[line_size] \n\t"
00111 "bne 1b \n\t"
00112 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00113 :
00114 : "memory", "r4", "r5", "r12");
00115 }
00116
00117 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00118 {
00119 int stride = line_size;
00120 __asm__ __volatile__ (
00121 "pld [%[pixels]] \n\t"
00122 "pld [%[pixels], #32] \n\t"
00123 "pld [%[block]] \n\t"
00124 "pld [%[block], #32] \n\t"
00125 "and r12, %[pixels], #7 \n\t"
00126 "bic %[pixels], %[pixels], #7 \n\t"
00127 "tmcr wcgr1, r12 \n\t"
00128 "add r4, %[pixels], %[line_size]\n\t"
00129 "add r5, %[block], %[line_size] \n\t"
00130 "mov %[line_size], %[line_size], lsl #1 \n\t"
00131 "1: \n\t"
00132 "wldrd wr0, [%[pixels]] \n\t"
00133 "wldrd wr1, [%[pixels], #8] \n\t"
00134 "subs %[h], %[h], #2 \n\t"
00135 "wldrd wr2, [%[pixels], #16] \n\t"
00136 "add %[pixels], %[pixels], %[line_size] \n\t"
00137 "wldrd wr3, [r4] \n\t"
00138 "pld [%[pixels]] \n\t"
00139 "pld [%[pixels], #32] \n\t"
00140 "walignr1 wr8, wr0, wr1 \n\t"
00141 "wldrd wr4, [r4, #8] \n\t"
00142 "walignr1 wr9, wr1, wr2 \n\t"
00143 "wldrd wr5, [r4, #16] \n\t"
00144 "add r4, r4, %[line_size] \n\t"
00145 "wldrd wr0, [%[block]] \n\t"
00146 "pld [r4] \n\t"
00147 "wldrd wr1, [%[block], #8] \n\t"
00148 "pld [r4, #32] \n\t"
00149 "wldrd wr2, [r5] \n\t"
00150 "walignr1 wr10, wr3, wr4 \n\t"
00151 "wldrd wr3, [r5, #8] \n\t"
00152 WAVG2B" wr8, wr8, wr0 \n\t"
00153 WAVG2B" wr9, wr9, wr1 \n\t"
00154 WAVG2B" wr10, wr10, wr2 \n\t"
00155 "wstrd wr8, [%[block]] \n\t"
00156 "walignr1 wr11, wr4, wr5 \n\t"
00157 WAVG2B" wr11, wr11, wr3 \n\t"
00158 "wstrd wr9, [%[block], #8] \n\t"
00159 "add %[block], %[block], %[line_size] \n\t"
00160 "wstrd wr10, [r5] \n\t"
00161 "pld [%[block]] \n\t"
00162 "pld [%[block], #32] \n\t"
00163 "wstrd wr11, [r5, #8] \n\t"
00164 "add r5, r5, %[line_size] \n\t"
00165 "pld [r5] \n\t"
00166 "pld [r5, #32] \n\t"
00167 "bne 1b \n\t"
00168 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00169 :
00170 : "memory", "r4", "r5", "r12");
00171 }
00172
00173 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00174 {
00175 int stride = line_size;
00176
00177
00178 SET_RND(wr15);
00179 __asm__ __volatile__(
00180 "pld [%[pixels]] \n\t"
00181 "pld [%[pixels], #32] \n\t"
00182 "and r12, %[pixels], #7 \n\t"
00183 "bic %[pixels], %[pixels], #7 \n\t"
00184 "tmcr wcgr1, r12 \n\t"
00185 "add r12, r12, #1 \n\t"
00186 "add r4, %[pixels], %[line_size]\n\t"
00187 "tmcr wcgr2, r12 \n\t"
00188 "add r5, %[block], %[line_size] \n\t"
00189 "mov %[line_size], %[line_size], lsl #1 \n\t"
00190
00191 "1: \n\t"
00192 "wldrd wr10, [%[pixels]] \n\t"
00193 "cmp r12, #8 \n\t"
00194 "wldrd wr11, [%[pixels], #8] \n\t"
00195 "add %[pixels], %[pixels], %[line_size] \n\t"
00196 "wldrd wr13, [r4] \n\t"
00197 "pld [%[pixels]] \n\t"
00198 "wldrd wr14, [r4, #8] \n\t"
00199 "pld [%[pixels], #32] \n\t"
00200 "add r4, r4, %[line_size] \n\t"
00201 "walignr1 wr0, wr10, wr11 \n\t"
00202 "pld [r4] \n\t"
00203 "pld [r4, #32] \n\t"
00204 "walignr1 wr2, wr13, wr14 \n\t"
00205 "wmoveq wr4, wr11 \n\t"
00206 "wmoveq wr6, wr14 \n\t"
00207 "walignr2ne wr4, wr10, wr11 \n\t"
00208 "walignr2ne wr6, wr13, wr14 \n\t"
00209 WAVG2B" wr0, wr0, wr4 \n\t"
00210 WAVG2B" wr2, wr2, wr6 \n\t"
00211 "wstrd wr0, [%[block]] \n\t"
00212 "subs %[h], %[h], #2 \n\t"
00213 "wstrd wr2, [r5] \n\t"
00214 "add %[block], %[block], %[line_size] \n\t"
00215 "add r5, r5, %[line_size] \n\t"
00216 "bne 1b \n\t"
00217 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00218 :
00219 : "r4", "r5", "r12", "memory");
00220 }
00221
00222 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00223 {
00224 int stride = line_size;
00225
00226
00227 SET_RND(wr15);
00228 __asm__ __volatile__(
00229 "pld [%[pixels]] \n\t"
00230 "pld [%[pixels], #32] \n\t"
00231 "and r12, %[pixels], #7 \n\t"
00232 "bic %[pixels], %[pixels], #7 \n\t"
00233 "tmcr wcgr1, r12 \n\t"
00234 "add r12, r12, #1 \n\t"
00235 "add r4, %[pixels], %[line_size]\n\t"
00236 "tmcr wcgr2, r12 \n\t"
00237 "add r5, %[block], %[line_size] \n\t"
00238 "mov %[line_size], %[line_size], lsl #1 \n\t"
00239
00240 "1: \n\t"
00241 "wldrd wr10, [%[pixels]] \n\t"
00242 "cmp r12, #8 \n\t"
00243 "wldrd wr11, [%[pixels], #8] \n\t"
00244 "wldrd wr12, [%[pixels], #16] \n\t"
00245 "add %[pixels], %[pixels], %[line_size] \n\t"
00246 "wldrd wr13, [r4] \n\t"
00247 "pld [%[pixels]] \n\t"
00248 "wldrd wr14, [r4, #8] \n\t"
00249 "pld [%[pixels], #32] \n\t"
00250 "wldrd wr15, [r4, #16] \n\t"
00251 "add r4, r4, %[line_size] \n\t"
00252 "walignr1 wr0, wr10, wr11 \n\t"
00253 "pld [r4] \n\t"
00254 "pld [r4, #32] \n\t"
00255 "walignr1 wr1, wr11, wr12 \n\t"
00256 "walignr1 wr2, wr13, wr14 \n\t"
00257 "walignr1 wr3, wr14, wr15 \n\t"
00258 "wmoveq wr4, wr11 \n\t"
00259 "wmoveq wr5, wr12 \n\t"
00260 "wmoveq wr6, wr14 \n\t"
00261 "wmoveq wr7, wr15 \n\t"
00262 "walignr2ne wr4, wr10, wr11 \n\t"
00263 "walignr2ne wr5, wr11, wr12 \n\t"
00264 "walignr2ne wr6, wr13, wr14 \n\t"
00265 "walignr2ne wr7, wr14, wr15 \n\t"
00266 WAVG2B" wr0, wr0, wr4 \n\t"
00267 WAVG2B" wr1, wr1, wr5 \n\t"
00268 "wstrd wr0, [%[block]] \n\t"
00269 WAVG2B" wr2, wr2, wr6 \n\t"
00270 "wstrd wr1, [%[block], #8] \n\t"
00271 WAVG2B" wr3, wr3, wr7 \n\t"
00272 "add %[block], %[block], %[line_size] \n\t"
00273 "wstrd wr2, [r5] \n\t"
00274 "subs %[h], %[h], #2 \n\t"
00275 "wstrd wr3, [r5, #8] \n\t"
00276 "add r5, r5, %[line_size] \n\t"
00277 "bne 1b \n\t"
00278 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00279 :
00280 : "r4", "r5", "r12", "memory");
00281 }
00282
00283 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00284 {
00285 int stride = line_size;
00286
00287
00288 SET_RND(wr15);
00289 __asm__ __volatile__(
00290 "pld [%[pixels]] \n\t"
00291 "pld [%[pixels], #32] \n\t"
00292 "pld [%[block]] \n\t"
00293 "pld [%[block], #32] \n\t"
00294 "and r12, %[pixels], #7 \n\t"
00295 "bic %[pixels], %[pixels], #7 \n\t"
00296 "tmcr wcgr1, r12 \n\t"
00297 "add r12, r12, #1 \n\t"
00298 "add r4, %[pixels], %[line_size]\n\t"
00299 "tmcr wcgr2, r12 \n\t"
00300 "add r5, %[block], %[line_size] \n\t"
00301 "mov %[line_size], %[line_size], lsl #1 \n\t"
00302 "pld [r5] \n\t"
00303 "pld [r5, #32] \n\t"
00304
00305 "1: \n\t"
00306 "wldrd wr10, [%[pixels]] \n\t"
00307 "cmp r12, #8 \n\t"
00308 "wldrd wr11, [%[pixels], #8] \n\t"
00309 "add %[pixels], %[pixels], %[line_size] \n\t"
00310 "wldrd wr13, [r4] \n\t"
00311 "pld [%[pixels]] \n\t"
00312 "wldrd wr14, [r4, #8] \n\t"
00313 "pld [%[pixels], #32] \n\t"
00314 "add r4, r4, %[line_size] \n\t"
00315 "walignr1 wr0, wr10, wr11 \n\t"
00316 "pld [r4] \n\t"
00317 "pld [r4, #32] \n\t"
00318 "walignr1 wr2, wr13, wr14 \n\t"
00319 "wmoveq wr4, wr11 \n\t"
00320 "wmoveq wr6, wr14 \n\t"
00321 "walignr2ne wr4, wr10, wr11 \n\t"
00322 "wldrd wr10, [%[block]] \n\t"
00323 "walignr2ne wr6, wr13, wr14 \n\t"
00324 "wldrd wr12, [r5] \n\t"
00325 WAVG2B" wr0, wr0, wr4 \n\t"
00326 WAVG2B" wr2, wr2, wr6 \n\t"
00327 WAVG2B" wr0, wr0, wr10 \n\t"
00328 WAVG2B" wr2, wr2, wr12 \n\t"
00329 "wstrd wr0, [%[block]] \n\t"
00330 "subs %[h], %[h], #2 \n\t"
00331 "wstrd wr2, [r5] \n\t"
00332 "add %[block], %[block], %[line_size] \n\t"
00333 "add r5, r5, %[line_size] \n\t"
00334 "pld [%[block]] \n\t"
00335 "pld [%[block], #32] \n\t"
00336 "pld [r5] \n\t"
00337 "pld [r5, #32] \n\t"
00338 "bne 1b \n\t"
00339 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00340 :
00341 : "r4", "r5", "r12", "memory");
00342 }
00343
00344 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00345 {
00346 int stride = line_size;
00347
00348
00349 SET_RND(wr15);
00350 __asm__ __volatile__(
00351 "pld [%[pixels]] \n\t"
00352 "pld [%[pixels], #32] \n\t"
00353 "pld [%[block]] \n\t"
00354 "pld [%[block], #32] \n\t"
00355 "and r12, %[pixels], #7 \n\t"
00356 "bic %[pixels], %[pixels], #7 \n\t"
00357 "tmcr wcgr1, r12 \n\t"
00358 "add r12, r12, #1 \n\t"
00359 "add r4, %[pixels], %[line_size]\n\t"
00360 "tmcr wcgr2, r12 \n\t"
00361 "add r5, %[block], %[line_size] \n\t"
00362 "mov %[line_size], %[line_size], lsl #1 \n\t"
00363 "pld [r5] \n\t"
00364 "pld [r5, #32] \n\t"
00365
00366 "1: \n\t"
00367 "wldrd wr10, [%[pixels]] \n\t"
00368 "cmp r12, #8 \n\t"
00369 "wldrd wr11, [%[pixels], #8] \n\t"
00370 "wldrd wr12, [%[pixels], #16] \n\t"
00371 "add %[pixels], %[pixels], %[line_size] \n\t"
00372 "wldrd wr13, [r4] \n\t"
00373 "pld [%[pixels]] \n\t"
00374 "wldrd wr14, [r4, #8] \n\t"
00375 "pld [%[pixels], #32] \n\t"
00376 "wldrd wr15, [r4, #16] \n\t"
00377 "add r4, r4, %[line_size] \n\t"
00378 "walignr1 wr0, wr10, wr11 \n\t"
00379 "pld [r4] \n\t"
00380 "pld [r4, #32] \n\t"
00381 "walignr1 wr1, wr11, wr12 \n\t"
00382 "walignr1 wr2, wr13, wr14 \n\t"
00383 "walignr1 wr3, wr14, wr15 \n\t"
00384 "wmoveq wr4, wr11 \n\t"
00385 "wmoveq wr5, wr12 \n\t"
00386 "wmoveq wr6, wr14 \n\t"
00387 "wmoveq wr7, wr15 \n\t"
00388 "walignr2ne wr4, wr10, wr11 \n\t"
00389 "walignr2ne wr5, wr11, wr12 \n\t"
00390 "walignr2ne wr6, wr13, wr14 \n\t"
00391 "walignr2ne wr7, wr14, wr15 \n\t"
00392 "wldrd wr10, [%[block]] \n\t"
00393 WAVG2B" wr0, wr0, wr4 \n\t"
00394 "wldrd wr11, [%[block], #8] \n\t"
00395 WAVG2B" wr1, wr1, wr5 \n\t"
00396 "wldrd wr12, [r5] \n\t"
00397 WAVG2B" wr2, wr2, wr6 \n\t"
00398 "wldrd wr13, [r5, #8] \n\t"
00399 WAVG2B" wr3, wr3, wr7 \n\t"
00400 WAVG2B" wr0, wr0, wr10 \n\t"
00401 WAVG2B" wr1, wr1, wr11 \n\t"
00402 WAVG2B" wr2, wr2, wr12 \n\t"
00403 WAVG2B" wr3, wr3, wr13 \n\t"
00404 "wstrd wr0, [%[block]] \n\t"
00405 "subs %[h], %[h], #2 \n\t"
00406 "wstrd wr1, [%[block], #8] \n\t"
00407 "add %[block], %[block], %[line_size] \n\t"
00408 "wstrd wr2, [r5] \n\t"
00409 "pld [%[block]] \n\t"
00410 "wstrd wr3, [r5, #8] \n\t"
00411 "add r5, r5, %[line_size] \n\t"
00412 "pld [%[block], #32] \n\t"
00413 "pld [r5] \n\t"
00414 "pld [r5, #32] \n\t"
00415 "bne 1b \n\t"
00416 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00417 :
00418 :"r4", "r5", "r12", "memory");
00419 }
00420
00421 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00422 {
00423 int stride = line_size;
00424
00425
00426 __asm__ __volatile__(
00427 "pld [%[pixels]] \n\t"
00428 "pld [%[pixels], #32] \n\t"
00429 "and r12, %[pixels], #7 \n\t"
00430 "tmcr wcgr1, r12 \n\t"
00431 "bic %[pixels], %[pixels], #7 \n\t"
00432
00433 "wldrd wr10, [%[pixels]] \n\t"
00434 "wldrd wr11, [%[pixels], #8] \n\t"
00435 "pld [%[block]] \n\t"
00436 "add %[pixels], %[pixels], %[line_size] \n\t"
00437 "walignr1 wr0, wr10, wr11 \n\t"
00438 "pld [%[pixels]] \n\t"
00439 "pld [%[pixels], #32] \n\t"
00440
00441 "1: \n\t"
00442 "wldrd wr10, [%[pixels]] \n\t"
00443 "wldrd wr11, [%[pixels], #8] \n\t"
00444 "add %[pixels], %[pixels], %[line_size] \n\t"
00445 "pld [%[pixels]] \n\t"
00446 "pld [%[pixels], #32] \n\t"
00447 "walignr1 wr4, wr10, wr11 \n\t"
00448 "wldrd wr10, [%[block]] \n\t"
00449 WAVG2B" wr8, wr0, wr4 \n\t"
00450 WAVG2B" wr8, wr8, wr10 \n\t"
00451 "wstrd wr8, [%[block]] \n\t"
00452 "add %[block], %[block], %[line_size] \n\t"
00453
00454 "wldrd wr10, [%[pixels]] \n\t"
00455 "wldrd wr11, [%[pixels], #8] \n\t"
00456 "pld [%[block]] \n\t"
00457 "add %[pixels], %[pixels], %[line_size] \n\t"
00458 "pld [%[pixels]] \n\t"
00459 "pld [%[pixels], #32] \n\t"
00460 "walignr1 wr0, wr10, wr11 \n\t"
00461 "wldrd wr10, [%[block]] \n\t"
00462 WAVG2B" wr8, wr0, wr4 \n\t"
00463 WAVG2B" wr8, wr8, wr10 \n\t"
00464 "wstrd wr8, [%[block]] \n\t"
00465 "add %[block], %[block], %[line_size] \n\t"
00466
00467 "subs %[h], %[h], #2 \n\t"
00468 "pld [%[block]] \n\t"
00469 "bne 1b \n\t"
00470 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00471 :
00472 : "cc", "memory", "r12");
00473 }
00474
00475 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00476 {
00477 int stride = line_size;
00478
00479
00480 __asm__ __volatile__(
00481 "pld [%[pixels]] \n\t"
00482 "pld [%[pixels], #32] \n\t"
00483 "and r12, %[pixels], #7 \n\t"
00484 "tmcr wcgr1, r12 \n\t"
00485 "bic %[pixels], %[pixels], #7 \n\t"
00486
00487 "wldrd wr10, [%[pixels]] \n\t"
00488 "wldrd wr11, [%[pixels], #8] \n\t"
00489 "wldrd wr12, [%[pixels], #16] \n\t"
00490 "add %[pixels], %[pixels], %[line_size] \n\t"
00491 "pld [%[pixels]] \n\t"
00492 "pld [%[pixels], #32] \n\t"
00493 "walignr1 wr0, wr10, wr11 \n\t"
00494 "walignr1 wr1, wr11, wr12 \n\t"
00495
00496 "1: \n\t"
00497 "wldrd wr10, [%[pixels]] \n\t"
00498 "wldrd wr11, [%[pixels], #8] \n\t"
00499 "wldrd wr12, [%[pixels], #16] \n\t"
00500 "add %[pixels], %[pixels], %[line_size] \n\t"
00501 "pld [%[pixels]] \n\t"
00502 "pld [%[pixels], #32] \n\t"
00503 "walignr1 wr4, wr10, wr11 \n\t"
00504 "walignr1 wr5, wr11, wr12 \n\t"
00505 WAVG2B" wr8, wr0, wr4 \n\t"
00506 WAVG2B" wr9, wr1, wr5 \n\t"
00507 "wstrd wr8, [%[block]] \n\t"
00508 "wstrd wr9, [%[block], #8] \n\t"
00509 "add %[block], %[block], %[line_size] \n\t"
00510
00511 "wldrd wr10, [%[pixels]] \n\t"
00512 "wldrd wr11, [%[pixels], #8] \n\t"
00513 "wldrd wr12, [%[pixels], #16] \n\t"
00514 "add %[pixels], %[pixels], %[line_size] \n\t"
00515 "pld [%[pixels]] \n\t"
00516 "pld [%[pixels], #32] \n\t"
00517 "walignr1 wr0, wr10, wr11 \n\t"
00518 "walignr1 wr1, wr11, wr12 \n\t"
00519 WAVG2B" wr8, wr0, wr4 \n\t"
00520 WAVG2B" wr9, wr1, wr5 \n\t"
00521 "wstrd wr8, [%[block]] \n\t"
00522 "wstrd wr9, [%[block], #8] \n\t"
00523 "add %[block], %[block], %[line_size] \n\t"
00524
00525 "subs %[h], %[h], #2 \n\t"
00526 "bne 1b \n\t"
00527 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00528 :
00529 : "r4", "r5", "r12", "memory");
00530 }
00531
00532 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00533 {
00534 int stride = line_size;
00535
00536
00537 __asm__ __volatile__(
00538 "pld [%[pixels]] \n\t"
00539 "pld [%[pixels], #32] \n\t"
00540 "and r12, %[pixels], #7 \n\t"
00541 "tmcr wcgr1, r12 \n\t"
00542 "bic %[pixels], %[pixels], #7 \n\t"
00543
00544 "wldrd wr10, [%[pixels]] \n\t"
00545 "wldrd wr11, [%[pixels], #8] \n\t"
00546 "pld [%[block]] \n\t"
00547 "wldrd wr12, [%[pixels], #16] \n\t"
00548 "add %[pixels], %[pixels], %[line_size] \n\t"
00549 "pld [%[pixels]] \n\t"
00550 "pld [%[pixels], #32] \n\t"
00551 "walignr1 wr0, wr10, wr11 \n\t"
00552 "walignr1 wr1, wr11, wr12 \n\t"
00553
00554 "1: \n\t"
00555 "wldrd wr10, [%[pixels]] \n\t"
00556 "wldrd wr11, [%[pixels], #8] \n\t"
00557 "wldrd wr12, [%[pixels], #16] \n\t"
00558 "add %[pixels], %[pixels], %[line_size] \n\t"
00559 "pld [%[pixels]] \n\t"
00560 "pld [%[pixels], #32] \n\t"
00561 "walignr1 wr4, wr10, wr11 \n\t"
00562 "walignr1 wr5, wr11, wr12 \n\t"
00563 "wldrd wr10, [%[block]] \n\t"
00564 "wldrd wr11, [%[block], #8] \n\t"
00565 WAVG2B" wr8, wr0, wr4 \n\t"
00566 WAVG2B" wr9, wr1, wr5 \n\t"
00567 WAVG2B" wr8, wr8, wr10 \n\t"
00568 WAVG2B" wr9, wr9, wr11 \n\t"
00569 "wstrd wr8, [%[block]] \n\t"
00570 "wstrd wr9, [%[block], #8] \n\t"
00571 "add %[block], %[block], %[line_size] \n\t"
00572
00573 "wldrd wr10, [%[pixels]] \n\t"
00574 "wldrd wr11, [%[pixels], #8] \n\t"
00575 "pld [%[block]] \n\t"
00576 "wldrd wr12, [%[pixels], #16] \n\t"
00577 "add %[pixels], %[pixels], %[line_size] \n\t"
00578 "pld [%[pixels]] \n\t"
00579 "pld [%[pixels], #32] \n\t"
00580 "walignr1 wr0, wr10, wr11 \n\t"
00581 "walignr1 wr1, wr11, wr12 \n\t"
00582 "wldrd wr10, [%[block]] \n\t"
00583 "wldrd wr11, [%[block], #8] \n\t"
00584 WAVG2B" wr8, wr0, wr4 \n\t"
00585 WAVG2B" wr9, wr1, wr5 \n\t"
00586 WAVG2B" wr8, wr8, wr10 \n\t"
00587 WAVG2B" wr9, wr9, wr11 \n\t"
00588 "wstrd wr8, [%[block]] \n\t"
00589 "wstrd wr9, [%[block], #8] \n\t"
00590 "add %[block], %[block], %[line_size] \n\t"
00591
00592 "subs %[h], %[h], #2 \n\t"
00593 "pld [%[block]] \n\t"
00594 "bne 1b \n\t"
00595 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00596 :
00597 : "r4", "r5", "r12", "memory");
00598 }
00599
00600 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00601 {
00602
00603
00604 SET_RND(wr15);
00605 __asm__ __volatile__(
00606 "pld [%[pixels]] \n\t"
00607 "mov r12, #2 \n\t"
00608 "pld [%[pixels], #32] \n\t"
00609 "tmcr wcgr0, r12 \n\t"
00610 "and r12, %[pixels], #7 \n\t"
00611 "bic %[pixels], %[pixels], #7 \n\t"
00612 "tmcr wcgr1, r12 \n\t"
00613
00614
00615
00616 "wldrd wr12, [%[pixels]] \n\t"
00617 "add r12, r12, #1 \n\t"
00618 "wldrd wr13, [%[pixels], #8] \n\t"
00619 "tmcr wcgr2, r12 \n\t"
00620 "add %[pixels], %[pixels], %[line_size] \n\t"
00621 "cmp r12, #8 \n\t"
00622 "pld [%[pixels]] \n\t"
00623 "pld [%[pixels], #32] \n\t"
00624 "walignr1 wr2, wr12, wr13 \n\t"
00625 "wmoveq wr10, wr13 \n\t"
00626 "walignr2ne wr10, wr12, wr13 \n\t"
00627 "wunpckelub wr0, wr2 \n\t"
00628 "wunpckehub wr1, wr2 \n\t"
00629 "wunpckelub wr8, wr10 \n\t"
00630 "wunpckehub wr9, wr10 \n\t"
00631 "waddhus wr0, wr0, wr8 \n\t"
00632 "waddhus wr1, wr1, wr9 \n\t"
00633
00634 "1: \n\t"
00635
00636
00637 "wldrd wr12, [%[pixels]] \n\t"
00638 "cmp r12, #8 \n\t"
00639 "wldrd wr13, [%[pixels], #8] \n\t"
00640 "add %[pixels], %[pixels], %[line_size] \n\t"
00641 "walignr1 wr6, wr12, wr13 \n\t"
00642 "pld [%[pixels]] \n\t"
00643 "pld [%[pixels], #32] \n\t"
00644 "wmoveq wr10, wr13 \n\t"
00645 "walignr2ne wr10, wr12, wr13 \n\t"
00646 "wunpckelub wr4, wr6 \n\t"
00647 "wunpckehub wr5, wr6 \n\t"
00648 "wunpckelub wr8, wr10 \n\t"
00649 "wunpckehub wr9, wr10 \n\t"
00650 "waddhus wr4, wr4, wr8 \n\t"
00651 "waddhus wr5, wr5, wr9 \n\t"
00652 "waddhus wr8, wr0, wr4 \n\t"
00653 "waddhus wr9, wr1, wr5 \n\t"
00654 "waddhus wr8, wr8, wr15 \n\t"
00655 "waddhus wr9, wr9, wr15 \n\t"
00656 "wsrlhg wr8, wr8, wcgr0 \n\t"
00657 "wsrlhg wr9, wr9, wcgr0 \n\t"
00658 "wpackhus wr8, wr8, wr9 \n\t"
00659 "wstrd wr8, [%[block]] \n\t"
00660 "add %[block], %[block], %[line_size] \n\t"
00661
00662
00663
00664 "wldrd wr12, [%[pixels]] \n\t"
00665 "wldrd wr13, [%[pixels], #8] \n\t"
00666 "add %[pixels], %[pixels], %[line_size] \n\t"
00667 "walignr1 wr2, wr12, wr13 \n\t"
00668 "pld [%[pixels]] \n\t"
00669 "pld [%[pixels], #32] \n\t"
00670 "wmoveq wr10, wr13 \n\t"
00671 "walignr2ne wr10, wr12, wr13 \n\t"
00672 "wunpckelub wr0, wr2 \n\t"
00673 "wunpckehub wr1, wr2 \n\t"
00674 "wunpckelub wr8, wr10 \n\t"
00675 "wunpckehub wr9, wr10 \n\t"
00676 "waddhus wr0, wr0, wr8 \n\t"
00677 "waddhus wr1, wr1, wr9 \n\t"
00678 "waddhus wr8, wr0, wr4 \n\t"
00679 "waddhus wr9, wr1, wr5 \n\t"
00680 "waddhus wr8, wr8, wr15 \n\t"
00681 "waddhus wr9, wr9, wr15 \n\t"
00682 "wsrlhg wr8, wr8, wcgr0 \n\t"
00683 "wsrlhg wr9, wr9, wcgr0 \n\t"
00684 "wpackhus wr8, wr8, wr9 \n\t"
00685 "subs %[h], %[h], #2 \n\t"
00686 "wstrd wr8, [%[block]] \n\t"
00687 "add %[block], %[block], %[line_size] \n\t"
00688 "bne 1b \n\t"
00689 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00690 : [line_size]"r"(line_size)
00691 : "r12", "memory");
00692 }
00693
00694 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00695 {
00696
00697
00698 SET_RND(wr15);
00699 __asm__ __volatile__(
00700 "pld [%[pixels]] \n\t"
00701 "mov r12, #2 \n\t"
00702 "pld [%[pixels], #32] \n\t"
00703 "tmcr wcgr0, r12 \n\t"
00704
00705 "and r12, %[pixels], #7 \n\t"
00706 "bic %[pixels], %[pixels], #7 \n\t"
00707 "tmcr wcgr1, r12 \n\t"
00708 "add r12, r12, #1 \n\t"
00709 "tmcr wcgr2, r12 \n\t"
00710
00711
00712
00713 "wldrd wr12, [%[pixels]] \n\t"
00714 "cmp r12, #8 \n\t"
00715 "wldrd wr13, [%[pixels], #8] \n\t"
00716 "wldrd wr14, [%[pixels], #16] \n\t"
00717 "add %[pixels], %[pixels], %[line_size] \n\t"
00718 "pld [%[pixels]] \n\t"
00719 "walignr1 wr2, wr12, wr13 \n\t"
00720 "pld [%[pixels], #32] \n\t"
00721 "walignr1 wr3, wr13, wr14 \n\t"
00722 "wmoveq wr10, wr13 \n\t"
00723 "wmoveq wr11, wr14 \n\t"
00724 "walignr2ne wr10, wr12, wr13 \n\t"
00725 "walignr2ne wr11, wr13, wr14 \n\t"
00726 "wunpckelub wr0, wr2 \n\t"
00727 "wunpckehub wr1, wr2 \n\t"
00728 "wunpckelub wr2, wr3 \n\t"
00729 "wunpckehub wr3, wr3 \n\t"
00730 "wunpckelub wr8, wr10 \n\t"
00731 "wunpckehub wr9, wr10 \n\t"
00732 "wunpckelub wr10, wr11 \n\t"
00733 "wunpckehub wr11, wr11 \n\t"
00734 "waddhus wr0, wr0, wr8 \n\t"
00735 "waddhus wr1, wr1, wr9 \n\t"
00736 "waddhus wr2, wr2, wr10 \n\t"
00737 "waddhus wr3, wr3, wr11 \n\t"
00738
00739 "1: \n\t"
00740
00741
00742 "wldrd wr12, [%[pixels]] \n\t"
00743 "cmp r12, #8 \n\t"
00744 "wldrd wr13, [%[pixels], #8] \n\t"
00745 "wldrd wr14, [%[pixels], #16] \n\t"
00746 "add %[pixels], %[pixels], %[line_size] \n\t"
00747 "walignr1 wr6, wr12, wr13 \n\t"
00748 "pld [%[pixels]] \n\t"
00749 "pld [%[pixels], #32] \n\t"
00750 "walignr1 wr7, wr13, wr14 \n\t"
00751 "wmoveq wr10, wr13 \n\t"
00752 "wmoveq wr11, wr14 \n\t"
00753 "walignr2ne wr10, wr12, wr13 \n\t"
00754 "walignr2ne wr11, wr13, wr14 \n\t"
00755 "wunpckelub wr4, wr6 \n\t"
00756 "wunpckehub wr5, wr6 \n\t"
00757 "wunpckelub wr6, wr7 \n\t"
00758 "wunpckehub wr7, wr7 \n\t"
00759 "wunpckelub wr8, wr10 \n\t"
00760 "wunpckehub wr9, wr10 \n\t"
00761 "wunpckelub wr10, wr11 \n\t"
00762 "wunpckehub wr11, wr11 \n\t"
00763 "waddhus wr4, wr4, wr8 \n\t"
00764 "waddhus wr5, wr5, wr9 \n\t"
00765 "waddhus wr6, wr6, wr10 \n\t"
00766 "waddhus wr7, wr7, wr11 \n\t"
00767 "waddhus wr8, wr0, wr4 \n\t"
00768 "waddhus wr9, wr1, wr5 \n\t"
00769 "waddhus wr10, wr2, wr6 \n\t"
00770 "waddhus wr11, wr3, wr7 \n\t"
00771 "waddhus wr8, wr8, wr15 \n\t"
00772 "waddhus wr9, wr9, wr15 \n\t"
00773 "waddhus wr10, wr10, wr15 \n\t"
00774 "waddhus wr11, wr11, wr15 \n\t"
00775 "wsrlhg wr8, wr8, wcgr0 \n\t"
00776 "wsrlhg wr9, wr9, wcgr0 \n\t"
00777 "wsrlhg wr10, wr10, wcgr0 \n\t"
00778 "wsrlhg wr11, wr11, wcgr0 \n\t"
00779 "wpackhus wr8, wr8, wr9 \n\t"
00780 "wpackhus wr9, wr10, wr11 \n\t"
00781 "wstrd wr8, [%[block]] \n\t"
00782 "wstrd wr9, [%[block], #8] \n\t"
00783 "add %[block], %[block], %[line_size] \n\t"
00784
00785
00786
00787 "wldrd wr12, [%[pixels]] \n\t"
00788 "wldrd wr13, [%[pixels], #8] \n\t"
00789 "wldrd wr14, [%[pixels], #16] \n\t"
00790 "add %[pixels], %[pixels], %[line_size] \n\t"
00791 "walignr1 wr2, wr12, wr13 \n\t"
00792 "pld [%[pixels]] \n\t"
00793 "pld [%[pixels], #32] \n\t"
00794 "walignr1 wr3, wr13, wr14 \n\t"
00795 "wmoveq wr10, wr13 \n\t"
00796 "wmoveq wr11, wr14 \n\t"
00797 "walignr2ne wr10, wr12, wr13 \n\t"
00798 "walignr2ne wr11, wr13, wr14 \n\t"
00799 "wunpckelub wr0, wr2 \n\t"
00800 "wunpckehub wr1, wr2 \n\t"
00801 "wunpckelub wr2, wr3 \n\t"
00802 "wunpckehub wr3, wr3 \n\t"
00803 "wunpckelub wr8, wr10 \n\t"
00804 "wunpckehub wr9, wr10 \n\t"
00805 "wunpckelub wr10, wr11 \n\t"
00806 "wunpckehub wr11, wr11 \n\t"
00807 "waddhus wr0, wr0, wr8 \n\t"
00808 "waddhus wr1, wr1, wr9 \n\t"
00809 "waddhus wr2, wr2, wr10 \n\t"
00810 "waddhus wr3, wr3, wr11 \n\t"
00811 "waddhus wr8, wr0, wr4 \n\t"
00812 "waddhus wr9, wr1, wr5 \n\t"
00813 "waddhus wr10, wr2, wr6 \n\t"
00814 "waddhus wr11, wr3, wr7 \n\t"
00815 "waddhus wr8, wr8, wr15 \n\t"
00816 "waddhus wr9, wr9, wr15 \n\t"
00817 "waddhus wr10, wr10, wr15 \n\t"
00818 "waddhus wr11, wr11, wr15 \n\t"
00819 "wsrlhg wr8, wr8, wcgr0 \n\t"
00820 "wsrlhg wr9, wr9, wcgr0 \n\t"
00821 "wsrlhg wr10, wr10, wcgr0 \n\t"
00822 "wsrlhg wr11, wr11, wcgr0 \n\t"
00823 "wpackhus wr8, wr8, wr9 \n\t"
00824 "wpackhus wr9, wr10, wr11 \n\t"
00825 "wstrd wr8, [%[block]] \n\t"
00826 "wstrd wr9, [%[block], #8] \n\t"
00827 "add %[block], %[block], %[line_size] \n\t"
00828
00829 "subs %[h], %[h], #2 \n\t"
00830 "bne 1b \n\t"
00831 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00832 : [line_size]"r"(line_size)
00833 : "r12", "memory");
00834 }
00835
00836 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00837 {
00838
00839
00840 SET_RND(wr15);
00841 __asm__ __volatile__(
00842 "pld [%[block]] \n\t"
00843 "pld [%[block], #32] \n\t"
00844 "pld [%[pixels]] \n\t"
00845 "mov r12, #2 \n\t"
00846 "pld [%[pixels], #32] \n\t"
00847 "tmcr wcgr0, r12 \n\t"
00848 "and r12, %[pixels], #7 \n\t"
00849 "bic %[pixels], %[pixels], #7 \n\t"
00850 "tmcr wcgr1, r12 \n\t"
00851
00852
00853
00854 "wldrd wr12, [%[pixels]] \n\t"
00855 "add r12, r12, #1 \n\t"
00856 "wldrd wr13, [%[pixels], #8] \n\t"
00857 "tmcr wcgr2, r12 \n\t"
00858 "add %[pixels], %[pixels], %[line_size] \n\t"
00859 "cmp r12, #8 \n\t"
00860 "pld [%[pixels]] \n\t"
00861 "pld [%[pixels], #32] \n\t"
00862 "walignr1 wr2, wr12, wr13 \n\t"
00863 "wmoveq wr10, wr13 \n\t"
00864 "walignr2ne wr10, wr12, wr13 \n\t"
00865 "wunpckelub wr0, wr2 \n\t"
00866 "wunpckehub wr1, wr2 \n\t"
00867 "wunpckelub wr8, wr10 \n\t"
00868 "wunpckehub wr9, wr10 \n\t"
00869 "waddhus wr0, wr0, wr8 \n\t"
00870 "waddhus wr1, wr1, wr9 \n\t"
00871
00872 "1: \n\t"
00873
00874
00875 "wldrd wr12, [%[pixels]] \n\t"
00876 "cmp r12, #8 \n\t"
00877 "wldrd wr13, [%[pixels], #8] \n\t"
00878 "add %[pixels], %[pixels], %[line_size] \n\t"
00879 "walignr1 wr6, wr12, wr13 \n\t"
00880 "pld [%[pixels]] \n\t"
00881 "pld [%[pixels], #32] \n\t"
00882 "wmoveq wr10, wr13 \n\t"
00883 "walignr2ne wr10, wr12, wr13 \n\t"
00884 "wunpckelub wr4, wr6 \n\t"
00885 "wunpckehub wr5, wr6 \n\t"
00886 "wunpckelub wr8, wr10 \n\t"
00887 "wunpckehub wr9, wr10 \n\t"
00888 "waddhus wr4, wr4, wr8 \n\t"
00889 "waddhus wr5, wr5, wr9 \n\t"
00890 "waddhus wr8, wr0, wr4 \n\t"
00891 "waddhus wr9, wr1, wr5 \n\t"
00892 "waddhus wr8, wr8, wr15 \n\t"
00893 "waddhus wr9, wr9, wr15 \n\t"
00894 "wldrd wr12, [%[block]] \n\t"
00895 "wsrlhg wr8, wr8, wcgr0 \n\t"
00896 "wsrlhg wr9, wr9, wcgr0 \n\t"
00897 "wpackhus wr8, wr8, wr9 \n\t"
00898 WAVG2B" wr8, wr8, wr12 \n\t"
00899 "wstrd wr8, [%[block]] \n\t"
00900 "add %[block], %[block], %[line_size] \n\t"
00901 "wldrd wr12, [%[pixels]] \n\t"
00902 "pld [%[block]] \n\t"
00903 "pld [%[block], #32] \n\t"
00904
00905
00906
00907 "wldrd wr13, [%[pixels], #8] \n\t"
00908 "add %[pixels], %[pixels], %[line_size] \n\t"
00909 "walignr1 wr2, wr12, wr13 \n\t"
00910 "pld [%[pixels]] \n\t"
00911 "pld [%[pixels], #32] \n\t"
00912 "wmoveq wr10, wr13 \n\t"
00913 "walignr2ne wr10, wr12, wr13 \n\t"
00914 "wunpckelub wr0, wr2 \n\t"
00915 "wunpckehub wr1, wr2 \n\t"
00916 "wunpckelub wr8, wr10 \n\t"
00917 "wunpckehub wr9, wr10 \n\t"
00918 "waddhus wr0, wr0, wr8 \n\t"
00919 "waddhus wr1, wr1, wr9 \n\t"
00920 "waddhus wr8, wr0, wr4 \n\t"
00921 "waddhus wr9, wr1, wr5 \n\t"
00922 "waddhus wr8, wr8, wr15 \n\t"
00923 "waddhus wr9, wr9, wr15 \n\t"
00924 "wldrd wr12, [%[block]] \n\t"
00925 "wsrlhg wr8, wr8, wcgr0 \n\t"
00926 "wsrlhg wr9, wr9, wcgr0 \n\t"
00927 "wpackhus wr8, wr8, wr9 \n\t"
00928 "subs %[h], %[h], #2 \n\t"
00929 WAVG2B" wr8, wr8, wr12 \n\t"
00930 "wstrd wr8, [%[block]] \n\t"
00931 "add %[block], %[block], %[line_size] \n\t"
00932 "pld [%[block]] \n\t"
00933 "pld [%[block], #32] \n\t"
00934 "bne 1b \n\t"
00935 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00936 : [line_size]"r"(line_size)
00937 : "r12", "memory");
00938 }
00939
00940 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00941 {
00942
00943
00944 SET_RND(wr15);
00945 __asm__ __volatile__(
00946 "pld [%[block]] \n\t"
00947 "pld [%[block], #32] \n\t"
00948 "pld [%[pixels]] \n\t"
00949 "mov r12, #2 \n\t"
00950 "pld [%[pixels], #32] \n\t"
00951 "tmcr wcgr0, r12 \n\t"
00952
00953 "and r12, %[pixels], #7 \n\t"
00954 "bic %[pixels], %[pixels], #7 \n\t"
00955 "tmcr wcgr1, r12 \n\t"
00956 "add r12, r12, #1 \n\t"
00957 "tmcr wcgr2, r12 \n\t"
00958
00959
00960
00961 "wldrd wr12, [%[pixels]] \n\t"
00962 "cmp r12, #8 \n\t"
00963 "wldrd wr13, [%[pixels], #8] \n\t"
00964 "wldrd wr14, [%[pixels], #16] \n\t"
00965 "add %[pixels], %[pixels], %[line_size] \n\t"
00966 "pld [%[pixels]] \n\t"
00967 "walignr1 wr2, wr12, wr13 \n\t"
00968 "pld [%[pixels], #32] \n\t"
00969 "walignr1 wr3, wr13, wr14 \n\t"
00970 "wmoveq wr10, wr13 \n\t"
00971 "wmoveq wr11, wr14 \n\t"
00972 "walignr2ne wr10, wr12, wr13 \n\t"
00973 "walignr2ne wr11, wr13, wr14 \n\t"
00974 "wunpckelub wr0, wr2 \n\t"
00975 "wunpckehub wr1, wr2 \n\t"
00976 "wunpckelub wr2, wr3 \n\t"
00977 "wunpckehub wr3, wr3 \n\t"
00978 "wunpckelub wr8, wr10 \n\t"
00979 "wunpckehub wr9, wr10 \n\t"
00980 "wunpckelub wr10, wr11 \n\t"
00981 "wunpckehub wr11, wr11 \n\t"
00982 "waddhus wr0, wr0, wr8 \n\t"
00983 "waddhus wr1, wr1, wr9 \n\t"
00984 "waddhus wr2, wr2, wr10 \n\t"
00985 "waddhus wr3, wr3, wr11 \n\t"
00986
00987 "1: \n\t"
00988
00989
00990 "wldrd wr12, [%[pixels]] \n\t"
00991 "cmp r12, #8 \n\t"
00992 "wldrd wr13, [%[pixels], #8] \n\t"
00993 "wldrd wr14, [%[pixels], #16] \n\t"
00994 "add %[pixels], %[pixels], %[line_size] \n\t"
00995 "walignr1 wr6, wr12, wr13 \n\t"
00996 "pld [%[pixels]] \n\t"
00997 "pld [%[pixels], #32] \n\t"
00998 "walignr1 wr7, wr13, wr14 \n\t"
00999 "wmoveq wr10, wr13 \n\t"
01000 "wmoveq wr11, wr14 \n\t"
01001 "walignr2ne wr10, wr12, wr13 \n\t"
01002 "walignr2ne wr11, wr13, wr14 \n\t"
01003 "wunpckelub wr4, wr6 \n\t"
01004 "wunpckehub wr5, wr6 \n\t"
01005 "wunpckelub wr6, wr7 \n\t"
01006 "wunpckehub wr7, wr7 \n\t"
01007 "wunpckelub wr8, wr10 \n\t"
01008 "wunpckehub wr9, wr10 \n\t"
01009 "wunpckelub wr10, wr11 \n\t"
01010 "wunpckehub wr11, wr11 \n\t"
01011 "waddhus wr4, wr4, wr8 \n\t"
01012 "waddhus wr5, wr5, wr9 \n\t"
01013 "waddhus wr6, wr6, wr10 \n\t"
01014 "waddhus wr7, wr7, wr11 \n\t"
01015 "waddhus wr8, wr0, wr4 \n\t"
01016 "waddhus wr9, wr1, wr5 \n\t"
01017 "waddhus wr10, wr2, wr6 \n\t"
01018 "waddhus wr11, wr3, wr7 \n\t"
01019 "waddhus wr8, wr8, wr15 \n\t"
01020 "waddhus wr9, wr9, wr15 \n\t"
01021 "waddhus wr10, wr10, wr15 \n\t"
01022 "waddhus wr11, wr11, wr15 \n\t"
01023 "wsrlhg wr8, wr8, wcgr0 \n\t"
01024 "wsrlhg wr9, wr9, wcgr0 \n\t"
01025 "wldrd wr12, [%[block]] \n\t"
01026 "wldrd wr13, [%[block], #8] \n\t"
01027 "wsrlhg wr10, wr10, wcgr0 \n\t"
01028 "wsrlhg wr11, wr11, wcgr0 \n\t"
01029 "wpackhus wr8, wr8, wr9 \n\t"
01030 "wpackhus wr9, wr10, wr11 \n\t"
01031 WAVG2B" wr8, wr8, wr12 \n\t"
01032 WAVG2B" wr9, wr9, wr13 \n\t"
01033 "wstrd wr8, [%[block]] \n\t"
01034 "wstrd wr9, [%[block], #8] \n\t"
01035 "add %[block], %[block], %[line_size] \n\t"
01036
01037
01038
01039 "wldrd wr12, [%[pixels]] \n\t"
01040 "pld [%[block]] \n\t"
01041 "wldrd wr13, [%[pixels], #8] \n\t"
01042 "pld [%[block], #32] \n\t"
01043 "wldrd wr14, [%[pixels], #16] \n\t"
01044 "add %[pixels], %[pixels], %[line_size] \n\t"
01045 "walignr1 wr2, wr12, wr13 \n\t"
01046 "pld [%[pixels]] \n\t"
01047 "pld [%[pixels], #32] \n\t"
01048 "walignr1 wr3, wr13, wr14 \n\t"
01049 "wmoveq wr10, wr13 \n\t"
01050 "wmoveq wr11, wr14 \n\t"
01051 "walignr2ne wr10, wr12, wr13 \n\t"
01052 "walignr2ne wr11, wr13, wr14 \n\t"
01053 "wunpckelub wr0, wr2 \n\t"
01054 "wunpckehub wr1, wr2 \n\t"
01055 "wunpckelub wr2, wr3 \n\t"
01056 "wunpckehub wr3, wr3 \n\t"
01057 "wunpckelub wr8, wr10 \n\t"
01058 "wunpckehub wr9, wr10 \n\t"
01059 "wunpckelub wr10, wr11 \n\t"
01060 "wunpckehub wr11, wr11 \n\t"
01061 "waddhus wr0, wr0, wr8 \n\t"
01062 "waddhus wr1, wr1, wr9 \n\t"
01063 "waddhus wr2, wr2, wr10 \n\t"
01064 "waddhus wr3, wr3, wr11 \n\t"
01065 "waddhus wr8, wr0, wr4 \n\t"
01066 "waddhus wr9, wr1, wr5 \n\t"
01067 "waddhus wr10, wr2, wr6 \n\t"
01068 "waddhus wr11, wr3, wr7 \n\t"
01069 "waddhus wr8, wr8, wr15 \n\t"
01070 "waddhus wr9, wr9, wr15 \n\t"
01071 "waddhus wr10, wr10, wr15 \n\t"
01072 "waddhus wr11, wr11, wr15 \n\t"
01073 "wsrlhg wr8, wr8, wcgr0 \n\t"
01074 "wsrlhg wr9, wr9, wcgr0 \n\t"
01075 "wldrd wr12, [%[block]] \n\t"
01076 "wldrd wr13, [%[block], #8] \n\t"
01077 "wsrlhg wr10, wr10, wcgr0 \n\t"
01078 "wsrlhg wr11, wr11, wcgr0 \n\t"
01079 "wpackhus wr8, wr8, wr9 \n\t"
01080 "wpackhus wr9, wr10, wr11 \n\t"
01081 WAVG2B" wr8, wr8, wr12 \n\t"
01082 WAVG2B" wr9, wr9, wr13 \n\t"
01083 "wstrd wr8, [%[block]] \n\t"
01084 "wstrd wr9, [%[block], #8] \n\t"
01085 "add %[block], %[block], %[line_size] \n\t"
01086 "subs %[h], %[h], #2 \n\t"
01087 "pld [%[block]] \n\t"
01088 "pld [%[block], #32] \n\t"
01089 "bne 1b \n\t"
01090 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
01091 : [line_size]"r"(line_size)
01092 : "r12", "memory");
01093 }