Dinesh K Garg | 0fdb337 | 2010-12-28 15:43:58 -0800 | [diff] [blame^] | 1 | #; Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved. |
| 2 | #; |
| 3 | #; Redistribution and use in source and binary forms, with or without |
| 4 | #; modification, are permitted provided that the following conditions are |
| 5 | #; met: |
| 6 | #; * Redistributions of source code must retain the above copyright |
| 7 | #; notice, this list of conditions and the following disclaimer. |
| 8 | #; * Redistributions in binary form must reproduce the above |
| 9 | #; copyright notice, this list of conditions and the following |
| 10 | #; disclaimer in the documentation and/or other materials provided |
| 11 | #; with the distribution. |
| 12 | #; * Neither the name of Code Aurora Forum, Inc. nor the names of its |
| 13 | #; contributors may be used to endorse or promote products derived |
| 14 | #; from this software without specific prior written permission. |
| 15 | #; |
| 16 | #; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
| 17 | #; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| 18 | #; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
| 19 | #; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
| 20 | #; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | #; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | #; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
| 23 | #; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| 24 | #; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
| 25 | #; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
| 26 | #; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 27 | |
| 28 | #;============================================================================== |
| 29 | |
| 30 | .code 32 @; Code is ARM ISA |
| 31 | #;============================================================================== |
| 32 | |
| 33 | .global png_read_filter_row_neon |
| 34 | |
| 35 | #;============================================================================== |
| 36 | #; INPUTS: r0 rowbytes: number of bytes in current row |
| 37 | #; r1 pixel_depth: number of bits per pixel |
| 38 | #; r2 row: pointer to start of current row (results are stored back through this pointer) |
| 39 | #; r3 prev_row: pointer to start of previous row |
| 40 | #; [sp,#0] filter: filter type (1 = SUB, 2 = UP, 3 = AVG, 4 = PAETH) |
| 41 | #; Entry point: loads the filter type into r12 and branches to the matching handler; 0 and any value above 4 branch straight to DONE. |
| 42 | #; NOTE: Don't touch r5-r11 (r4 is used by the SUB path, which saves and restores it) |
| 43 | #;============================================================================== |
| 44 | .balign 32 |
| 45 | .type png_read_filter_row_neon, %function |
| 46 | png_read_filter_row_neon: |
| 47 | |
| 48 | ldr r12,[sp,#0] |
| 49 | |
| 50 | cmp r12,#0 |
| 51 | beq DONE |
| 52 | |
| 53 | cmp r12,#1 |
| 54 | beq sub_filter |
| 55 | |
| 56 | cmp r12,#2 |
| 57 | beq up_filter |
| 58 | |
| 59 | cmp r12,#3 |
| 60 | beq avg_filter |
| 61 | |
| 62 | cmp r12,#4 |
| 63 | beq paeth_filter |
| 64 | |
| 65 | b DONE |
| 66 | |
| 67 | #;; --------------- |
| 68 | #;; SUB filter type: adds the previous pixel of the same row to each pixel, byte by byte (vadd.i8) |
| 69 | #;; --------------- |
| 70 | |
| 71 | |
| 72 | sub_filter: |
| 73 | |
| 74 | stmdb sp!, {r4} |
| 75 | |
| 76 | add r1,r1,#7 @; bpp = bytes per pixel |
| 77 | lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3 |
| 78 | mov r12,r1 |
| 79 | |
| 80 | #;; r0 = rowbytes |
| 81 | #;; r1 = loop counter = bpp (initially) |
| 82 | #;; r2 = row pointer |
| 83 | #;; r12 = bpp = loop/pointer increment value; r4 (saved above) = 16-byte block count in the 1 and 2 bpp paths |
| 84 | |
| 85 | cmp r1,r0 |
| 86 | beq sub_filter_exit @; exit if bpp == rowbytes (the row is a single pixel, nothing to add) |
| 87 | |
| 88 | cmp r12,#1 |
| 89 | beq sub_filter_1bpp |
| 90 | |
| 91 | cmp r12,#2 |
| 92 | beq sub_filter_2bpp |
| 93 | |
| 94 | cmp r12,#3 |
| 95 | beq sub_filter_3bpp |
| 96 | |
| 97 | cmp r12,#4 |
| 98 | beq sub_filter_4bpp |
| 99 | |
| 100 | cmp r12,#6 |
| 101 | beq sub_filter_6bpp |
| 102 | |
| 103 | cmp r12,#8 |
| 104 | beq sub_filter_8bpp |
| 105 | |
| 106 | sub_filter_exit: |
| 107 | b sub_filter_DONE @; return (also reached by fall-through when bpp is not 1, 2, 3, 4, 6 or 8) |
| 108 | |
| 109 | |
| 110 | sub_filter_1bpp: |
| 111 | |
| 112 | #;; ---------------------------- |
| 113 | #;; SUB filter, 1 byte per pixel: running byte sum over the row, 16 bytes per NEON pass, then a byte-at-a-time tail loop |
| 114 | #;; ---------------------------- |
| 115 | |
| 116 | lsrs r4,r0,#4 @; r4 = floor(rowbytes/16) |
| 117 | @; = iteration count for the 16-byte loop; rows shorter than 16 bytes skip it |
| 118 | beq sub_filter_1bpp_16bytes_done |
| 119 | |
| 120 | vmov.i8 d21, #0 |
| 121 | vld1.8 {d16,d17}, [r2] @; load 16 pixels |
| 122 | @; d16 = a b c d e f g h |
| 123 | @; d17 = i j k l m n o p |
| 124 | |
| 125 | mov r1, #0 |
| 126 | sub_filter_1bpp_16bytes: |
| 127 | |
| 128 | |
| 129 | |
| 130 | |
| 131 | vshl.i64 d18, d16, #8 @; d18 = 0 a b c d e f g |
| 132 | vadd.i8 d18, d16, d18 @; d18 = a a+b b+c c+d d+e e+f f+g g+h |
| 133 | |
| 134 | vshl.i64 d18, d18, #8 @; d18 = 0 a a+b b+c c+d d+e e+f f+g |
| 135 | vadd.i8 d18, d16, d18 @; d18 = a a+b a+b+c b+c+d c+d+e d+e+f e+f+g f+g+h |
| 136 | |
| 137 | vshl.i64 d18, d18, #8 @; shift add continuously to propagate the sum of previous |
| 138 | vadd.i8 d18, d16, d18 @; and current pixels |
| 139 | |
| 140 | vshl.i64 d18, d18, #8 |
| 141 | vadd.i8 d18, d16, d18 |
| 142 | |
| 143 | vshl.i64 d18, d18, #8 |
| 144 | vadd.i8 d18, d16, d18 |
| 145 | |
| 146 | vshl.i64 d18, d18, #8 |
| 147 | vadd.i8 d18, d16, d18 |
| 148 | |
| 149 | vshl.i64 d18, d18, #8 |
| 150 | vadd.i8 d18, d16, d18 @; maximum data size for shift is 64 bits i.e. doubleword. |
| 151 | @; after computing the value of all the pixels in the double word |
| 152 | @; extract the last computed value which will be used by |
| 153 | @; the next set of pixels (i.e next doubleword) |
| 154 | vext.8 d22, d18, d21, #7 @; extract the updated value of d18[7] i.e a+b+c+d+e+f+g+h (d21 is zero, so the other lanes of d22 are 0) |
| 155 | vadd.i8 d17, d17, d22 @; d17 = a+b+c+d+e+f+g+h+i j k l m n o p |
| 156 | |
| 157 | vshl.i64 d19, d17, #8 @; continue shift-add as the first half |
| 158 | vadd.i8 d19, d17, d19 |
| 159 | |
| 160 | vshl.i64 d19, d19, #8 |
| 161 | vadd.i8 d19, d17, d19 |
| 162 | |
| 163 | vshl.i64 d19, d19, #8 |
| 164 | vadd.i8 d19, d17, d19 |
| 165 | |
| 166 | vshl.i64 d19, d19, #8 |
| 167 | vadd.i8 d19, d17, d19 |
| 168 | |
| 169 | vshl.i64 d19, d19, #8 |
| 170 | vadd.i8 d19, d17, d19 |
| 171 | |
| 172 | vshl.i64 d19, d19, #8 |
| 173 | vadd.i8 d19, d17, d19 |
| 174 | |
| 175 | vshl.i64 d19, d19, #8 |
| 176 | vadd.i8 d19, d17, d19 |
| 177 | |
| 178 | vst1.8 {d18,d19},[r2]! @; store the result back and advance the row pointer by 16 |
| 179 | |
| 180 | add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed) |
| 181 | subs r4,r4,#1 @; decrement iteration count |
| 182 | beq sub_filter_1bpp_16bytes_adjust |
| 183 | |
| 184 | |
| 185 | vext.8 d22, d19, d21, #7 @; more iterations to go |
| 186 | @; extract the last computed value |
| 187 | vld1.8 {d16,d17}, [r2] @; load the next 16 bytes |
| 188 | vadd.i8 d16, d16, d22 @; set up the input by adding the previous pixel |
| 189 | @; value to the input |
| 190 | b sub_filter_1bpp_16bytes |
| 191 | |
| 192 | sub_filter_1bpp_16bytes_adjust: |
| 193 | |
| 194 | cmp r1, r0 @; bytes completed == rowbytes? (tested by the beq below; the sub in between leaves the flags alone) |
| 195 | sub r2, r2, #1 @; more pixels remaining |
| 196 | @; r2 points to the current pixel adjust it |
| 197 | @; so that it points to the prev pixel for the below loop |
| 198 | beq sub_filter_DONE |
| 199 | |
| 200 | sub_filter_1bpp_16bytes_done: |
| 201 | |
| 202 | |
| 203 | vld1.8 {d0[0]},[r2]! @; load 1 byte (1 pixel) into D0[0] |
| 204 | @; increment row pointer |
| 205 | sub_filter_1bpp_loop: |
| 206 | add r1,r1,r12 @; loop counter += bpp |
| 207 | cmp r1,r0 @; flags are consumed by the bne at the end of the loop body |
| 208 | |
| 209 | vld1.8 {d2[0]},[r2] @; load 1 byte (current pixel) into D2[0] |
| 210 | |
| 211 | vadd.i8 d0,d0,d2 @; vector add 1 byte of previous pixel with |
| 212 | @; 1 byte of current pixel |
| 213 | vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel) back |
| 214 | @; into row pointer location and increment |
| 215 | @; row pointer |
| 216 | |
| 217 | bne sub_filter_1bpp_loop @; loop back until loop counter == rowbytes |
| 218 | |
| 219 | b sub_filter_DONE @; return |
| 220 | |
| 221 | #;; ----------------------------- |
| 222 | #;; SUB filter, 2 bytes per pixel: same scheme as the 1 bpp path, shifting by 16 bits (one pixel) per step |
| 223 | #;; ----------------------------- |
| 224 | sub_filter_2bpp: |
| 225 | |
| 226 | lsrs r4,r0,#4 @; r4 = floor(rowbytes/16) |
| 227 | @; = iteration count for the 16-byte loop; rows shorter than 16 bytes skip it |
| 228 | beq sub_filter_2bpp_16bytes_done |
| 229 | |
| 230 | vmov.i8 d21, #0 |
| 231 | vld1.8 {d16,d17}, [r2] @; load 16 bytes to q8 |
| 232 | @; d16 = a b c d e f g h |
| 233 | @; d17 = i j k l m n o p |
| 234 | mov r1, #0 |
| 235 | sub_filter_2bpp_16bytes: |
| 236 | |
| 237 | vshl.i64 d18, d16, #16 @; each pixel is 2bytes .. shift by 16 bits to get previous pixel |
| 238 | vadd.i8 d18, d16, d18 @; add to the current pixel |
| 239 | |
| 240 | vshl.i64 d18, d18, #16 @; shift-add to propagate the computed sum as the case for 1bpp |
| 241 | vadd.i8 d18, d16, d18 |
| 242 | |
| 243 | vshl.i64 d18, d18, #16 |
| 244 | vadd.i8 d18, d16, d18 |
| 245 | |
| 246 | |
| 247 | vext.8 d22, d18, d21, #6 @; extract the last computed value (i.e. last 2 bytes); d21 is zero so the remaining lanes are 0 |
| 248 | vadd.i8 d17, d17, d22 @; add the last computed pixel to the input |
| 249 | |
| 250 | vshl.i64 d19, d17, #16 |
| 251 | vadd.i8 d19, d17, d19 |
| 252 | |
| 253 | vshl.i64 d19, d19, #16 |
| 254 | vadd.i8 d19, d17, d19 |
| 255 | |
| 256 | vshl.i64 d19, d19, #16 |
| 257 | vadd.i8 d19, d17, d19 |
| 258 | |
| 259 | |
| 260 | vst1.8 {d18,d19},[r2]! @; store the result back and advance the row pointer by 16 |
| 261 | |
| 262 | |
| 263 | add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed) |
| 264 | subs r4,r4,#1 @; decrement iteration count |
| 265 | beq sub_filter_2bpp_16bytes_adjust |
| 266 | |
| 267 | |
| 268 | vext.8 d22, d19, d21, #6 @; extract the last computed value |
| 269 | @; add the last computed pixel to the input |
| 270 | vld1.8 {d16,d17}, [r2] |
| 271 | vadd.i8 d16, d16, d22 |
| 272 | |
| 273 | b sub_filter_2bpp_16bytes |
| 274 | |
| 275 | |
| 276 | sub_filter_2bpp_16bytes_adjust: |
| 277 | |
| 278 | cmp r1, r0 @; bytes completed == rowbytes? (tested by the beq below; the sub in between leaves the flags alone) |
| 279 | sub r2, r2, #2 @; more pixels remaining |
| 280 | @; r2 points to the current pixel adjust it |
| 281 | @; so that it points to the prev pixel for the below loop |
| 282 | beq sub_filter_DONE |
| 283 | |
| 284 | sub_filter_2bpp_16bytes_done: |
| 285 | |
| 286 | vld1.16 {d0[0]},[r2]! @; load 2 bytes (1 pixel) into D0[0] |
| 287 | @; increment row pointer |
| 288 | sub_filter_2bpp_loop: |
| 289 | add r1,r1,r12 @; loop counter += bpp |
| 290 | cmp r1,r0 @; flags are consumed by the bne at the end of the loop body |
| 291 | |
| 292 | vld1.16 {d2[0]},[r2] @; load 2 bytes (current pixel) into D2[0] |
| 293 | vadd.i8 d0,d0,d2 @; vector add 2 bytes of previous pixel with |
| 294 | @; 2 bytes of current pixel |
| 295 | vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back |
| 296 | @; into row pointer location and increment |
| 297 | @; row pointer |
| 298 | |
| 299 | bne sub_filter_2bpp_loop @; loop back until loop counter == rowbytes |
| 300 | @ |
| 301 | b sub_filter_DONE @ ; return |
| 302 | |
| 303 | #;; ----------------------------- |
| 304 | #;; SUB filter, 3 bytes per pixel: one pixel per iteration, loaded as 4 bytes and stored as 2 + 1 bytes |
| 305 | #;; ----------------------------- |
| 306 | sub_filter_3bpp: |
| 307 | vld1.32 {d0[0]},[r2], r12 @; load 4 bytes (1 pixel + 1 extra byte) into D0[0] |
| 308 | @; increment row pointer by bpp |
| 309 | sub_filter_3bpp_loop: |
| 310 | add r1,r1,r12 @; loop counter += bpp |
| 311 | cmp r1,r0 @; flags are consumed by the bne at the end of the loop body |
| 312 | |
| 313 | vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel + 1 extra byte) into D2[0]; the extra byte lies just past the pixel and is never stored back |
| 314 | vadd.i8 d0,d0,d2 @; vector add 3 bytes of previous pixel with |
| 315 | @; 3 bytes of current pixel |
| 316 | vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back |
| 317 | @; into row pointer location and increment |
| 318 | @; row pointer |
| 319 | vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel) back |
| 320 | @; into row pointer location and increment |
| 321 | @; row pointer |
| 322 | |
| 323 | bne sub_filter_3bpp_loop @; loop back until loop counter == rowbytes |
| 324 | |
| 325 | b sub_filter_DONE @; return |
| 326 | |
| 327 | #;; ----------------------------- |
| 328 | #;; SUB filter, 4 bytes per pixel: one pixel (one 32-bit lane) per iteration |
| 329 | #;; ----------------------------- |
| 330 | sub_filter_4bpp: |
| 331 | vld1.32 {d0[0]},[r2]! @; load 4 bytes (1 pixel) into D0[0] |
| 332 | @; increment row pointer |
| 333 | sub_filter_4bpp_loop: @ |
| 334 | add r1,r1,r12 @; loop counter += bpp |
| 335 | cmp r1,r0 @; flags are consumed by the bne at the end of the loop body |
| 336 | |
| 337 | |
| 338 | vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel) into D2[0] |
| 339 | vadd.i8 d0,d0,d2 @; vector add 4 bytes of previous pixel with |
| 340 | @; 4 bytes of current pixel |
| 341 | vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back |
| 342 | @; into row pointer location and increment |
| 343 | @; row pointer |
| 344 | |
| 345 | bne sub_filter_4bpp_loop @; loop back until loop counter == rowbytes |
| 346 | |
| 347 | b sub_filter_DONE @; return |
| 348 | |
| 349 | #;; ----------------------------- |
| 350 | #;; SUB filter, 6 bytes per pixel: one pixel per iteration, loaded as 8 bytes and stored as 4 + 2 bytes |
| 351 | #;; ----------------------------- |
| 352 | sub_filter_6bpp: |
| 353 | vld1.8 {d0},[r2],r12 @; load 8 bytes (1 pixel + 2 extra bytes) into D0 |
| 354 | @; increment row pointer by bpp |
| 355 | sub_filter_6bpp_loop: @ |
| 356 | add r1,r1,r12 @; loop counter += bpp |
| 357 | cmp r1,r0 @; flags are consumed by the bne at the end of the loop body |
| 358 | |
| 359 | vld1.8 {d2},[r2] @; load 8 bytes (1 pixel + 2 extra bytes) into D2; the 2 extra bytes lie just past the pixel and are never stored back |
| 360 | vadd.i8 d0,d0,d2 @; vector add 6 bytes of previous pixel with |
| 361 | @; 6 bytes of current pixel |
| 362 | vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back |
| 363 | @; into row pointer location and increment |
| 364 | @; row pointer |
| 365 | vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel) back |
| 366 | @; into row pointer location and increment |
| 367 | @; row pointer |
| 368 | |
| 369 | bne sub_filter_6bpp_loop @; loop back until loop counter == rowbytes |
| 370 | |
| 371 | b sub_filter_DONE @; return |
| 372 | |
| 373 | #;; ----------------------------- |
| 374 | #;; SUB filter, 8 bytes per pixel: one pixel (one full D register) per iteration; sub_filter_DONE below is the exit shared by every SUB path |
| 375 | #;; ----------------------------- |
| 376 | sub_filter_8bpp: |
| 377 | vld1.8 {d0},[r2]! @; load 8 bytes (1 pixel) into D0 |
| 378 | @; increment row pointer |
| 379 | sub_filter_8bpp_loop: @ |
| 380 | add r1,r1,r12 @; loop counter += bpp |
| 381 | cmp r1,r0 @; flags are consumed by the bne at the end of the loop body |
| 382 | vld1.8 {d2},[r2] @; load 8 bytes (current pixel) into D2 |
| 383 | vadd.i8 d0,d0,d2 @; vector add 8 bytes of previous pixel with |
| 384 | @; 8 bytes of current pixel |
| 385 | vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel) back |
| 386 | @; into row pointer location and increment |
| 387 | @; row pointer |
| 388 | |
| 389 | |
| 390 | bne sub_filter_8bpp_loop @; loop back until loop counter == rowbytes |
| 391 | @; the epilogue that follows pops the r4 pushed at sub_filter and returns through r14 |
| 392 | b sub_filter_DONE @ ; return |
| 393 | |
| 394 | sub_filter_DONE: |
| 395 | |
| 396 | ldmia sp!, {r4} |
| 397 | bx r14 |
| 398 | |
| 399 | #;; -------------- |
| 400 | #;; UP filter type: adds each byte of the previous row to the byte at the same offset in the current row (vadd.i8) |
| 401 | #;; -------------- |
| 402 | up_filter: |
| 403 | |
| 404 | #;; r0 = rowbytes (decremented below as bytes are processed) |
| 405 | #;; r1 = pixel_depth (not required for UP filter type; reused as iteration count) |
| 406 | #;; r2 = row pointer |
| 407 | #;; r3 = previous row pointer |
| 408 | |
| 409 | |
| 410 | lsrs r1,r0,#5 @; r1 = floor(rowbytes/32) |
| 411 | @; = iteration count for loop32 |
| 412 | beq up_filter_32bytes_proc_done |
| 413 | |
| 414 | |
| 415 | up_filter_32bytes_proc: |
| 416 | |
| 417 | |
| 418 | mov r12, r2 |
| 419 | |
| 420 | vld1.8 {q0},[r3]! @; load 32 bytes from previous |
| 421 | vld1.8 {q2},[r3]! @; row and increment pointer |
| 422 | @ |
| 423 | @ |
| 424 | vld1.8 {q1},[r12]! @; load 32 bytes from current row |
| 425 | vld1.8 {q3},[r12]! @; (through r12, a copy of r2, so r2 still addresses the stores below) |
| 426 | @ |
| 427 | @ |
| 428 | @ |
| 429 | vadd.i8 q0,q0,q1 @; vector add of the first 16 bytes |
| 430 | vadd.i8 q2,q2,q3 @; vector add of the second 16 bytes |
| 431 | @ |
| 432 | @ |
| 433 | @ |
| 434 | vst1.8 {q0},[r2]! @; store 32 bytes to current row |
| 435 | vst1.8 {q2},[r2]! @ |
| 436 | @; and increment pointer |
| 437 | sub r0,r0,#32 @; subtract 32 from rowbytes |
| 438 | subs r1,r1,#1 @; decrement iteration count |
| 439 | bne up_filter_32bytes_proc |
| 440 | |
| 441 | |
| 442 | |
| 443 | up_filter_32bytes_proc_done: |
| 444 | |
| 445 | lsrs r1,r0,#4 @; r1 = floor(rowbytes/16) |
| 446 | @; = iteration count for loop16 |
| 447 | beq up_filter_16bytes_proc_done |
| 448 | |
| 449 | up_filter_16bytes_proc: |
| 450 | |
| 451 | vld1.8 {q0},[r3]! @; load 16 bytes from previous |
| 452 | @; row and increment pointer |
| 453 | vld1.8 {q1},[r2] @; load 16 bytes from current row |
| 454 | vadd.i8 q0,q0,q1 @; vector add of 16 bytes |
| 455 | vst1.8 {q0},[r2]! @; store 16 bytes to current row |
| 456 | @; and increment pointer |
| 457 | sub r0,r0,#16 @; subtract 16 from rowbytes |
| 458 | subs r1,r1,#1 @; decrement iteration count |
| 459 | bne up_filter_16bytes_proc |
| 460 | |
| 461 | up_filter_16bytes_proc_done: |
| 462 | |
| 463 | lsrs r1,r0,#3 @; r1 = floor(rowbytes/8); fewer than 16 bytes remain here, so the 8/4/2/1-byte steps below are not loops |
| 464 | beq up_filter_8bytes_proc_done |
| 465 | |
| 466 | up_filter_8bytes_proc: |
| 467 | |
| 468 | vld1.8 {d0},[r3]! @; load 8 bytes from previous |
| 469 | @; row and increment pointer |
| 470 | vld1.8 {d2},[r2] @; load 8 bytes from current row |
| 471 | vadd.i8 d0,d0,d2 @; vector add 8 bytes |
| 472 | vst1.8 {d0},[r2]! @; store 8 bytes to current row |
| 473 | @; and increment pointer |
| 474 | sub r0,r0,#8 @; subtract 8 from rowbytes |
| 475 | |
| 476 | up_filter_8bytes_proc_done: |
| 477 | |
| 478 | lsrs r1,r0,#2 @; r1 = floor(rowbytes/4) |
| 479 | beq up_filter_4bytes_proc_done |
| 480 | |
| 481 | up_filter_4bytes_proc: |
| 482 | |
| 483 | vld1.32 {d0[0]},[r3]! @; load 4 bytes from previous row |
| 484 | @; and increment pointer |
| 485 | vld1.32 {d2[0]},[r2] @; load 4 bytes from current row |
| 486 | vadd.i8 d0,d0,d2 @; vector add 4 bytes |
| 487 | vst1.32 {d0[0]},[r2]! @; store 4 bytes to current row |
| 488 | @; and increment pointer |
| 489 | sub r0,r0,#4 @; subtract 4 from rowbytes |
| 490 | |
| 491 | up_filter_4bytes_proc_done: |
| 492 | |
| 493 | lsrs r1,r0,#1 @; r1 = floor(rowbytes/2) |
| 494 | beq up_filter_2bytes_proc_done |
| 495 | |
| 496 | up_filter_2bytes_proc: |
| 497 | |
| 498 | vld1.16 {d0[0]},[r3]! @; load 2 bytes from previous row |
| 499 | @; and increment pointer |
| 500 | vld1.16 {d2[0]},[r2] @; load 2 bytes from current row |
| 501 | vadd.i8 d0,d0,d2 @; vector add 2 bytes |
| 502 | vst1.16 {d0[0]},[r2]! @; store 2 bytes to current row |
| 503 | @; and increment pointer |
| 504 | sub r0,r0,#2 @; subtract 2 from rowbytes |
| 505 | |
| 506 | up_filter_2bytes_proc_done: |
| 507 | |
| 508 | cmp r0,#0 |
| 509 | beq up_filter_1byte_proc_done |
| 510 | |
| 511 | up_filter_1byte_proc: |
| 512 | |
| 513 | vld1.8 {d0[0]},[r3]! @; load 1 byte from previous row |
| 514 | @; and increment pointer |
| 515 | vld1.8 {d2[0]},[r2] @; load 1 byte from current row |
| 516 | vadd.i8 d0,d0,d2 @; vector add 1 byte |
| 517 | vst1.8 {d0[0]},[r2]! @; store 1 byte to current row |
| 518 | @; and increment pointer |
| 519 | up_filter_1byte_proc_done: |
| 520 | |
| 521 | b DONE |
| 522 | |
| 523 | #;; --------------- |
| 524 | #;; AVG filter type: adds (a + b)/2 to each pixel x, where a is the updated previous pixel of this row and b the pixel from the previous row |
| 525 | #;; --------------- |
| 526 | avg_filter: |
| 527 | |
| 528 | add r1,r1,#7 @; bpp = bytes per pixel |
| 529 | lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3 |
| 530 | mov r12,r1 |
| 531 | |
| 532 | #;; r0 = rowbytes |
| 533 | #;; r1 = loop counter = bpp (initially) |
| 534 | #;; r2 = row pointer |
| 535 | #;; r3 = previous row pointer |
| 536 | #;; r12 = bpp = loop/pointer increment value |
| 537 | |
| 538 | cmp r12,#1 |
| 539 | beq avg_filter_1bpp |
| 540 | |
| 541 | cmp r12,#2 |
| 542 | beq avg_filter_2bpp |
| 543 | |
| 544 | cmp r12,#3 |
| 545 | beq avg_filter_3bpp |
| 546 | |
| 547 | cmp r12,#4 |
| 548 | beq avg_filter_4bpp |
| 549 | |
| 550 | cmp r12,#6 |
| 551 | beq avg_filter_6bpp |
| 552 | |
| 553 | cmp r12,#8 |
| 554 | beq avg_filter_8bpp |
| 555 | |
| 556 | avg_filter_exit: |
| 557 | b DONE @; return (reached by fall-through when bpp is not 1, 2, 3, 4, 6 or 8) |
| 558 | |
| 559 | #;; ---------------------------- |
| 560 | #;; AVG filter, 1 byte per pixel: first pixel gets b >> 1 added, every later pixel gets (a + b) >> 1 added |
| 561 | #;; ---------------------------- |
| 562 | avg_filter_1bpp: |
| 563 | |
| 564 | cmp r1,r0 |
| 565 | |
| 566 | vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr |
| 567 | @; row into d0[0] |
| 568 | vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev |
| 569 | @; row into d1[0] |
| 570 | @; increment prev row pointer |
| 571 | vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 572 | @; to pixel x (no pixel a term for the first pixel) |
| 573 | vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x) |
| 574 | @; increment curr row pointer |
| 575 | @; updated pixel x is now pixel a; the beq below still tests the cmp r1,r0 (bpp == rowbytes) made above |
| 576 | beq DONE |
| 577 | |
| 578 | avg_filter_1bpp_loop: |
| 579 | add r1,r1,r12 @; loop counter += bpp |
| 580 | cmp r1,r0 |
| 581 | |
| 582 | |
| 583 | vld1.8 {d2[0]},[r2] @; load 1 byte (pixel x) from curr |
| 584 | @; row into d2[0] |
| 585 | vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev |
| 586 | @; row into d1[0] |
| 587 | vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits per lane |
| 588 | vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2, narrowed back to 8 bits |
| 589 | vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 590 | vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x) |
| 591 | @; increment curr row pointer |
| 592 | bne avg_filter_1bpp_loop |
| 593 | |
| 594 | b DONE @; exit loop when |
| 595 | @; loop counter == rowbytes |
| 596 | #;; ----------------------------- |
| 597 | #;; AVG filter, 2 bytes per pixel: first pixel gets b >> 1 added, every later pixel gets (a + b) >> 1 added, per byte |
| 598 | #;; ----------------------------- |
| 599 | avg_filter_2bpp: |
| 600 | |
| 601 | cmp r1,r0 |
| 602 | |
| 603 | vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr |
| 604 | @; row into d0[0] |
| 605 | vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev |
| 606 | @; row into d1[0] |
| 607 | @; increment prev row pointer |
| 608 | vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 609 | @; to pixel x (no pixel a term for the first pixel) |
| 610 | vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 611 | @; increment curr row pointer |
| 612 | @; updated pixel x is now pixel a; the beq below still tests the cmp r1,r0 (bpp == rowbytes) made above |
| 613 | beq DONE |
| 614 | |
| 615 | avg_filter_2bpp_loop: |
| 616 | add r1,r1,r12 @; loop counter += bpp |
| 617 | cmp r1,r0 |
| 618 | |
| 619 | |
| 620 | vld1.16 {d2[0]},[r2] @; load 2 bytes (pixel x) from curr |
| 621 | @; row into d2[0] |
| 622 | vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev |
| 623 | @; row into d1[0] |
| 624 | vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits per lane |
| 625 | vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2, narrowed back to 8 bits |
| 626 | vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 627 | vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 628 | @; increment curr row pointer |
| 629 | |
| 630 | bne avg_filter_2bpp_loop |
| 631 | |
| 632 | b DONE @; exit loop when |
| 633 | @; loop counter == rowbytes |
| 634 | |
| 635 | #;; ----------------------------- |
| 636 | #;; AVG filter, 3 bytes per pixel: pixels are loaded as 4 bytes from both rows and stored as 2 + 1 bytes |
| 637 | #;; ----------------------------- |
| 638 | avg_filter_3bpp: |
| 639 | |
| 640 | cmp r1,r0 |
| 641 | |
| 642 | vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 643 | @; byte) from curr row into d0[0] |
| 644 | vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 645 | @; byte) from prev row into d1[0] |
| 646 | @; increment prev row pointer by bpp |
| 647 | vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 648 | @; to pixel x (no pixel a term for the first pixel) |
| 649 | vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 650 | @; increment curr row pointer |
| 651 | vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x) |
| 652 | @; increment curr row pointer |
| 653 | @; updated pixel x is now pixel a; the beq below still tests the cmp r1,r0 (bpp == rowbytes) made above |
| 654 | beq DONE |
| 655 | |
| 656 | avg_filter_3bpp_loop: |
| 657 | add r1,r1,r12 @; loop counter += bpp |
| 658 | cmp r1,r0 |
| 659 | |
| 660 | vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 661 | @; byte) from curr row into d2[0]; the extra byte is never stored back |
| 662 | vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 663 | @; byte) from prev row into d1[0] |
| 664 | vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits per lane |
| 665 | vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2, narrowed back to 8 bits |
| 666 | vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 667 | vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 668 | @; increment curr row pointer |
| 669 | vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x) |
| 670 | @; increment curr row pointer |
| 671 | |
| 672 | bne avg_filter_3bpp_loop |
| 673 | |
| 674 | b DONE @; exit loop when |
| 675 | @; loop counter == rowbytes |
| 676 | #;; ----------------------------- |
| 677 | #;; AVG filter, 4 bytes per pixel: one pixel (one 32-bit lane) per iteration |
| 678 | #;; ----------------------------- |
| 679 | avg_filter_4bpp: |
| 680 | |
| 681 | cmp r1,r0 |
| 682 | |
| 683 | vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 684 | @; row into d0[0] |
| 685 | vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 686 | @; row into d1[0] |
| 687 | @; increment prev row pointer |
| 688 | vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 689 | @; to pixel x (no pixel a term for the first pixel) |
| 690 | vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 691 | @; increment curr row pointer |
| 692 | @; updated pixel x is now pixel a; the beq below still tests the cmp r1,r0 (bpp == rowbytes) made above |
| 693 | beq DONE |
| 694 | |
| 695 | avg_filter_4bpp_loop: |
| 696 | add r1,r1,r12 @; loop counter += bpp |
| 697 | cmp r1,r0 |
| 698 | |
| 699 | |
| 700 | vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 701 | @; row into d2[0] |
| 702 | vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 703 | @; row into d1[0] |
| 704 | vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits per lane |
| 705 | vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2, narrowed back to 8 bits |
| 706 | vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2) |
| 707 | vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 708 | @; increment curr row pointer |
| 709 | bne avg_filter_4bpp_loop |
| 710 | |
| 711 | b DONE @; exit loop when |
| 712 | @; loop counter == rowbytes |
| 713 | #;; ----------------------------- |
| 714 | #;; AVG filter, 6 bytes per pixel: pixels are loaded as 8 bytes from both rows and stored as 4 + 2 bytes |
| 715 | #;; ----------------------------- |
| 716 | avg_filter_6bpp: |
| 717 | |
| 718 | cmp r1,r0 |
| 719 | |
| 720 | vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra |
| 721 | @; bytes) from curr row into d0 |
| 722 | vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 723 | @; bytes) from prev row into d1 |
| 724 | @; increment prev row pointer by bpp |
| 725 | vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 726 | @; to pixel x (no pixel a term for the first pixel) |
| 727 | vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 728 | @; increment curr row pointer |
| 729 | @; updated pixel x is now pixel a |
| 730 | vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x) |
| 731 | @; increment curr row pointer |
| 732 | @; updated pixel x is now pixel a; the beq below still tests the cmp r1,r0 (bpp == rowbytes) made above |
| 733 | beq DONE |
| 734 | |
| 735 | avg_filter_6bpp_loop: |
| 736 | add r1,r1,r12 @; loop counter += bpp |
| 737 | cmp r1,r0 |
| 738 | |
| 739 | |
| 740 | vld1.8 {d2},[r2] @; load 8 bytes (pixel x + 2 extra |
| 741 | @; bytes) from curr row into d2; the 2 extra bytes are never stored back |
| 742 | vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 743 | @; bytes) from prev row into d1 |
| 744 | vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits per lane |
| 745 | vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2, narrowed back to 8 bits |
| 746 | vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2) |
| 747 | vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 748 | @; increment curr row pointer |
| 749 | vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x) |
| 750 | @; increment curr row pointer |
| 751 | bne avg_filter_6bpp_loop |
| 752 | |
| 753 | b DONE @; exit loop when |
| 754 | @; loop counter == rowbytes |
| 755 | #;; ----------------------------- |
| 756 | #;; AVG filter, 8 bytes per pixel: one pixel (one full D register) per iteration |
| 757 | #;; ----------------------------- |
| 758 | avg_filter_8bpp: |
| 759 | |
| 760 | cmp r1,r0 |
| 761 | |
| 762 | vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr |
| 763 | @; row into d0 |
| 764 | vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev |
| 765 | @; row into d1 |
| 766 | @; increment prev row pointer |
| 767 | vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add |
| 768 | @; to pixel x (no pixel a term for the first pixel) |
| 769 | vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x) |
| 770 | @; increment curr row pointer |
| 771 | @; updated pixel x is now pixel a; the beq below still tests the cmp r1,r0 (bpp == rowbytes) made above |
| 772 | beq DONE |
| 773 | avg_filter_8bpp_loop: |
| 774 | add r1,r1,r12 @; loop counter += bpp |
| 775 | cmp r1,r0 |
| 776 | |
| 777 | |
| 778 | vld1.8 {d2},[r2] @; load 8 bytes (pixel x) from curr |
| 779 | @; row into d2 |
| 780 | vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev |
| 781 | @; row into d1 |
| 782 | vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b), widened to 16 bits per lane |
| 783 | vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2, narrowed back to 8 bits |
| 784 | vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2) |
| 785 | vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x) |
| 786 | @; increment curr row pointer |
| 787 | bne avg_filter_8bpp_loop |
| 788 | |
| 789 | b DONE @; exit loop when |
| 790 | @; loop counter == rowbytes |
| 791 | #;; ----------------- |
| 792 | #;; PAETH filter type: saves q4-q7 (used as scratch by the per-bpp loops), computes bpp and dispatches on it |
| 793 | #;; ----------------- |
| 794 | paeth_filter: |
| 795 | |
| 796 | VPUSH {q4-q7} |
| 797 | add r1,r1,#7 @; bpp = bytes per pixel |
| 798 | lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3 |
| 799 | mov r12,r1 |
| 800 | |
| 801 | #;; r0 = rowbytes |
| 802 | #;; r1 = loop counter = bpp (initially) |
| 803 | #;; r2 = row pointer |
| 804 | #;; r3 = previous row pointer |
| 805 | #;; r12 = bpp = loop/pointer increment value |
| 806 | |
| 807 | |
| 808 | cmp r12,#1 |
| 809 | beq paeth_filter_1bpp |
| 810 | |
| 811 | cmp r12,#2 |
| 812 | beq paeth_filter_2bpp |
| 813 | |
| 814 | cmp r12,#3 |
| 815 | beq paeth_filter_3bpp |
| 816 | |
| 817 | cmp r12,#4 |
| 818 | beq paeth_filter_4bpp |
| 819 | |
| 820 | cmp r12,#6 |
| 821 | beq paeth_filter_6bpp |
| 822 | |
| 823 | cmp r12,#8 |
| 824 | beq paeth_filter_8bpp |
| 825 | |
| 826 | paeth_filter_exit: |
| 827 | b paeth_filter_DONE @; return (reached by fall-through when bpp is not 1, 2, 3, 4, 6 or 8) |
| 828 | |
| 829 | #;; ------------------------------ |
| 830 | #;; PAETH filter, 1 byte per pixel: first pixel gets b added; later pixels get whichever of a, b, c has the smallest pa/pb/pc distance added |
| 831 | #;; ------------------------------ |
| 832 | paeth_filter_1bpp: |
| 833 | |
| 834 | cmp r1, r0 |
| 835 | |
| 836 | vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr |
| 837 | @; row into d0[0] |
| 838 | vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev |
| 839 | @; row into d1[0] |
| 840 | @; increment prev row pointer |
| 841 | vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x |
| 842 | vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x) |
| 843 | @; increment curr row pointer |
| 844 | |
| 845 | beq paeth_filter_DONE |
| 846 | |
| 847 | paeth_filter_1bpp_loop: |
| 848 | add r1,r1,r12 @; loop counter += bpp |
| 849 | cmp r1,r0 |
| 850 | |
| 851 | |
| 852 | #;; d1[0] = c (b in the previous loop iteration) |
| 853 | #;; d2[0] = a (x in the previous loop iteration) |
| 854 | vld1.8 {d3[0]},[r3]! @; load 1 byte (pixel b) from prev |
| 855 | @; row into d3[0] |
| 856 | vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr |
| 857 | @; row into d0[0] |
| 858 | vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c |
| 859 | vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) |
| 860 | vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) |
| 861 | vaddl.u8 q5,d2,d3 @; q5 = a + b |
| 862 | vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 863 | |
| 864 | vcle.s16 q5,q2,q3 @; q5 = (pa <= pb) |
| 865 | vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 866 | vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 867 | vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 868 | vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)), narrowed to a per-byte mask |
| 869 | vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc), narrowed to a per-byte mask |
| 870 | @ |
| 871 | vand d2,d2,d10 @; d2 = a where 1, 0 where 0 |
| 872 | vbsl d14,d3,d1 @; d14 = b where 1, c where 0 |
| 873 | vmvn d10,d10 @; invert d10 |
| 874 | vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0 |
| 875 | vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate |
| 876 | vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x) |
| 877 | vmov d1,d3 @; d1 = b (c for next iteration) |
| 878 | vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x) |
| 879 | |
| 880 | |
| 881 | bne paeth_filter_1bpp_loop |
| 882 | |
| 883 | b paeth_filter_DONE @; exit loop when |
| 884 | @; loop counter == rowbytes |
| 885 | #;; ------------------------------- |
| 886 | #;; PAETH filter, 2 bytes per pixel (one pixel per iteration, its bytes handled lane-wise) |
| 887 | #;; ------------------------------- |
| 888 | paeth_filter_2bpp: |
| 889 | |
| 890 | cmp r1, r0 |
| 891 | |
| 892 | vld1.16 {d0[0]},[r2] @; load 2 bytes (first pixel x) from curr |
| 893 | @; row into d0[0] |
| 894 | vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev |
| 895 | @; row into d1[0] |
| 896 | @; and advance the prev row pointer |
| 897 | vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x (first pixel is predicted from b alone) |
| 898 | vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 899 | @; and advance the curr row pointer; the beq below still sees the cmp r1,r0 flags |
| 900 | beq paeth_filter_DONE |
| 901 | |
| 902 | paeth_filter_2bpp_loop: |
| 903 | add r1,r1,r12 @; loop counter (r1) += bpp (r12) |
| 904 | cmp r1,r0 |
| 905 | |
| 906 | #;; d1[0] = c (b in the previous loop iteration, i.e. the prev-row pixel one step back) |
| 907 | #;; d2[0] = a (updated x of the previous loop iteration, i.e. the left neighbour in curr row) |
| 908 | vld1.16 {d3[0]},[r3]! @; load 2 bytes (pixel b) from prev |
| 909 | @; row into d3[0], then advance the prev row pointer |
| 910 | vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr |
| 911 | @; row into d0[0]; r2 only advances at the store below |
| 912 | vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c (widened to 16-bit lanes) |
| 913 | vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) (16-bit lanes) |
| 914 | vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) (16-bit lanes) |
| 915 | vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes, so no wrap-around) |
| 916 | vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 917 | |
| 918 | vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), as an all-ones / all-zeros lane mask |
| 919 | vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 920 | vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 921 | vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 922 | vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)), mask narrowed to 8-bit lanes |
| 923 | vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc), mask narrowed to 8-bit lanes |
| 924 | |
| 925 | vand d2,d2,d10 @; d2 = a where d10 is set, 0 elsewhere |
| 926 | vbsl d14,d3,d1 @; d14 = b where (pb <= pc), c elsewhere |
| 927 | vmvn d10,d10 @; invert d10: now set where a was NOT selected |
| 928 | vand d14,d14,d10 @; d14 = b/c where a was not selected, 0 elsewhere |
| 929 | vadd.i8 d2,d2,d14 @; d2 = p = a/b/c (the two operands are never both non-zero in a lane) |
| 930 | vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x, 8-bit wrap-around add per byte) |
| 931 | vmov d1,d3 @; d1 = b (c for next iteration) |
| 932 | vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x) |
| 933 | @; and advance the curr row pointer |
| 934 | bne paeth_filter_2bpp_loop |
| 935 | |
| 936 | b paeth_filter_DONE @; reached once loop counter (r1) == rowbytes (r0) |
| 937 | @; (the bne above tests the cmp r1,r0 done at the top of the loop) |
| 938 | #;; ------------------------------- |
| 939 | #;; PAETH filter, 3 bytes per pixel (one pixel per iteration, its bytes handled lane-wise) |
| 940 | #;; ------------------------------- |
| 941 | paeth_filter_3bpp: |
| 942 | |
| 943 | cmp r1, r0 |
| 944 | |
| 945 | vld1.32 {d0[0]},[r2] @; load 4 bytes (first pixel x + 1 extra |
| 946 | @; byte) from curr row into d0[0]; the extra byte is never stored back |
| 947 | vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 948 | @; byte) from prev row into d1[0] |
| 949 | @; and advance the prev row pointer by r12 (bpp) |
| 950 | vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x (first pixel is predicted from b alone) |
| 951 | vst1.16 {d2[0]},[r2]! @; store 2 bytes (bytes 0-1 of updated pixel x) |
| 952 | @; and advance the curr row pointer |
| 953 | vst1.8 {d2[2]},[r2]! @; store 1 byte (byte 2 of updated pixel x) |
| 954 | @; and advance the curr row pointer; the beq below still sees the cmp r1,r0 flags |
| 955 | beq paeth_filter_DONE |
| 956 | |
| 957 | paeth_filter_3bpp_loop: |
| 958 | add r1,r1,r12 @; loop counter (r1) += bpp (r12) |
| 959 | cmp r1,r0 |
| 960 | |
| 961 | |
| 962 | #;; d1[0] = c (b in the previous loop iteration, i.e. the prev-row pixel one step back) |
| 963 | #;; d2[0] = a (updated x of the previous loop iteration, i.e. the left neighbour in curr row) |
| 964 | vld1.32 {d3[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra |
| 965 | @; byte) from prev row into d3[0], then advance r3 by r12 (bpp) |
| 966 | vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra |
| 967 | @; byte) from curr row into d0[0]. NOTE(review): for the last pixel both loads read 1 byte past the pixel data, presumably beyond the end of the row - confirm the row buffers are padded |
| 968 | vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c (widened to 16-bit lanes) |
| 969 | vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) (16-bit lanes) |
| 970 | vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) (16-bit lanes) |
| 971 | vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes, so no wrap-around) |
| 972 | vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 973 | @ |
| 974 | vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), as an all-ones / all-zeros lane mask |
| 975 | vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 976 | vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 977 | vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 978 | vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)), mask narrowed to 8-bit lanes |
| 979 | vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc), mask narrowed to 8-bit lanes |
| 980 | @ |
| 981 | vand d2,d2,d10 @; d2 = a where d10 is set, 0 elsewhere |
| 982 | vbsl d14,d3,d1 @; d14 = b where (pb <= pc), c elsewhere |
| 983 | vmvn d10,d10 @; invert d10: now set where a was NOT selected |
| 984 | vand d14,d14,d10 @; d14 = b/c where a was not selected, 0 elsewhere |
| 985 | vadd.i8 d2,d2,d14 @; d2 = p = a/b/c (the two operands are never both non-zero in a lane) |
| 986 | vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x, 8-bit wrap-around add per byte) |
| 987 | vmov d1,d3 @; d1 = b (c for next iteration) |
| 988 | vst1.16 {d2[0]},[r2]! @; store 2 bytes (bytes 0-1 of updated pixel x) |
| 989 | @; and advance the curr row pointer |
| 990 | vst1.8 {d2[2]},[r2]! @; store 1 byte (byte 2 of updated pixel x) |
| 991 | @; and advance the curr row pointer |
| 992 | bne paeth_filter_3bpp_loop |
| 993 | |
| 994 | b paeth_filter_DONE @; reached once loop counter (r1) == rowbytes (r0) |
| 995 | @; (the bne above tests the cmp r1,r0 done at the top of the loop) |
| 996 | #;; ------------------------------- |
| 997 | #;; PAETH filter, 4 bytes per pixel (one pixel per iteration, its bytes handled lane-wise) |
| 998 | #;; ------------------------------- |
| 999 | paeth_filter_4bpp: |
| 1000 | |
| 1001 | cmp r1, r0 |
| 1002 | |
| 1003 | vld1.32 {d0[0]},[r2] @; load 4 bytes (first pixel x) from curr |
| 1004 | @; row into d0[0] |
| 1005 | vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 1006 | @; row into d1[0] |
| 1007 | @; and advance the prev row pointer |
| 1008 | vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x (first pixel is predicted from b alone) |
| 1009 | vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 1010 | @; and advance the curr row pointer; the beq below still sees the cmp r1,r0 flags |
| 1011 | beq paeth_filter_DONE |
| 1012 | |
| 1013 | paeth_filter_4bpp_loop: |
| 1014 | add r1,r1,r12 @; loop counter (r1) += bpp (r12) |
| 1015 | cmp r1,r0 |
| 1016 | |
| 1017 | |
| 1018 | #;; d1[0] = c (b in the previous loop iteration, i.e. the prev-row pixel one step back) |
| 1019 | #;; d2[0] = a (updated x of the previous loop iteration, i.e. the left neighbour in curr row) |
| 1020 | vld1.32 {d3[0]},[r3]! @; load 4 bytes (pixel b) from prev |
| 1021 | @; row into d3[0], then advance the prev row pointer |
| 1022 | vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr |
| 1023 | @; row into d0[0]; r2 only advances at the store below |
| 1024 | vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c (widened to 16-bit lanes) |
| 1025 | vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) (16-bit lanes) |
| 1026 | vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) (16-bit lanes) |
| 1027 | vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes, so no wrap-around) |
| 1028 | vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1029 | @ |
| 1030 | vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), as an all-ones / all-zeros lane mask |
| 1031 | vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1032 | vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1033 | vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1034 | vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)), mask narrowed to 8-bit lanes |
| 1035 | vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc), mask narrowed to 8-bit lanes |
| 1036 | @ |
| 1037 | vand d2,d2,d10 @; d2 = a where d10 is set, 0 elsewhere |
| 1038 | vbsl d14,d3,d1 @; d14 = b where (pb <= pc), c elsewhere |
| 1039 | vmvn d10,d10 @; invert d10: now set where a was NOT selected |
| 1040 | vand d14,d14,d10 @; d14 = b/c where a was not selected, 0 elsewhere |
| 1041 | vadd.i8 d2,d2,d14 @; d2 = p = a/b/c (the two operands are never both non-zero in a lane) |
| 1042 | vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x, 8-bit wrap-around add per byte) |
| 1043 | vmov d1,d3 @; d1 = b (c for next iteration) |
| 1044 | vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x) |
| 1045 | @; and advance the curr row pointer |
| 1046 | bne paeth_filter_4bpp_loop |
| 1047 | |
| 1048 | b paeth_filter_DONE @; reached once loop counter (r1) == rowbytes (r0) |
| 1049 | @; (the bne above tests the cmp r1,r0 done at the top of the loop) |
| 1050 | #;; ------------------------------- |
| 1051 | #;; PAETH filter, 6 bytes per pixel (one pixel per iteration, its bytes handled lane-wise) |
| 1052 | #;; ------------------------------- |
| 1053 | paeth_filter_6bpp: |
| 1054 | cmp r1, r0 |
| 1055 | |
| 1056 | vld1.8 {d0},[r2] @; load 8 bytes (first pixel x + 2 extra |
| 1057 | @; bytes) from curr row into d0; the extra bytes are never stored back |
| 1058 | vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 1059 | @; bytes) from prev row into d1 |
| 1060 | @; and advance the prev row pointer by r12 (bpp) |
| 1061 | vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x (first pixel is predicted from b alone) |
| 1062 | vst1.32 {d2[0]},[r2]! @; store 4 bytes (bytes 0-3 of updated pixel x) |
| 1063 | @; and advance the curr row pointer |
| 1064 | vst1.16 {d2[2]},[r2]! @; store 2 bytes (bytes 4-5 of updated pixel x) |
| 1065 | @; and advance the curr row pointer; the beq below still sees the cmp r1,r0 flags |
| 1066 | beq paeth_filter_DONE |
| 1067 | |
| 1068 | paeth_filter_6bpp_loop: |
| 1069 | add r1,r1,r12 @; loop counter (r1) += bpp (r12) |
| 1070 | cmp r1,r0 |
| 1071 | |
| 1072 | |
| 1073 | #;; d1 = c (b in the previous loop iteration, i.e. the prev-row pixel one step back) |
| 1074 | #;; d2 = a (updated x of the previous loop iteration, i.e. the left neighbour in curr row) |
| 1075 | vld1.8 {d3},[r3],r12 @; load 8 bytes (pixel b + 2 extra |
| 1076 | @; bytes) from prev row into d3, then advance r3 by r12 (bpp) |
| 1077 | vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra |
| 1078 | @; bytes) from curr row into d0. NOTE(review): for the last pixel both loads read 2 bytes past the pixel data, presumably beyond the end of the row - confirm the row buffers are padded |
| 1079 | vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c (widened to 16-bit lanes) |
| 1080 | vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) (16-bit lanes) |
| 1081 | vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) (16-bit lanes) |
| 1082 | vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes, so no wrap-around) |
| 1083 | vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1084 | |
| 1085 | vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), as an all-ones / all-zeros lane mask |
| 1086 | vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1087 | vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1088 | vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1089 | vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)), mask narrowed to 8-bit lanes |
| 1090 | vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc), mask narrowed to 8-bit lanes |
| 1091 | |
| 1092 | vand d2,d2,d10 @; d2 = a where d10 is set, 0 elsewhere |
| 1093 | vbsl d14,d3,d1 @; d14 = b where (pb <= pc), c elsewhere |
| 1094 | vmvn d10,d10 @; invert d10: now set where a was NOT selected |
| 1095 | vand d14,d14,d10 @; d14 = b/c where a was not selected, 0 elsewhere |
| 1096 | vadd.i8 d2,d2,d14 @; d2 = p = a/b/c (the two operands are never both non-zero in a lane) |
| 1097 | vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x, 8-bit wrap-around add per byte) |
| 1098 | vmov d1,d3 @; d1 = b (c for next iteration) |
| 1099 | vst1.32 {d2[0]},[r2]! @; store 4 bytes (bytes 0-3 of updated pixel x) |
| 1100 | @; and advance the curr row pointer |
| 1101 | vst1.16 {d2[2]},[r2]! @; store 2 bytes (bytes 4-5 of updated pixel x) |
| 1102 | @; and advance the curr row pointer |
| 1103 | bne paeth_filter_6bpp_loop |
| 1104 | |
| 1105 | b paeth_filter_DONE @; reached once loop counter (r1) == rowbytes (r0) |
| 1106 | @; (the bne above tests the cmp r1,r0 done at the top of the loop) |
| 1107 | #;; ------------------------------- |
| 1108 | #;; PAETH filter, 8 bytes per pixel (one pixel per iteration, its bytes handled lane-wise) |
| 1109 | #;; ------------------------------- |
| 1110 | paeth_filter_8bpp: |
| 1111 | cmp r1, r0 |
| 1112 | |
| 1113 | vld1.8 {d0},[r2] @; load 8 bytes (first pixel x) from curr |
| 1114 | @; row into d0 |
| 1115 | vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev |
| 1116 | @; row into d1 |
| 1117 | @; and advance the prev row pointer |
| 1118 | vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x (first pixel is predicted from b alone) |
| 1119 | vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x) |
| 1120 | @; and advance the curr row pointer; the beq below still sees the cmp r1,r0 flags |
| 1121 | beq paeth_filter_DONE |
| 1122 | |
| 1123 | paeth_filter_8bpp_loop: |
| 1124 | add r1,r1,r12 @; loop counter (r1) += bpp (r12) |
| 1125 | cmp r1,r0 |
| 1126 | |
| 1127 | |
| 1128 | #;; d1 = c (b in the previous loop iteration, i.e. the prev-row pixel one step back) |
| 1129 | #;; d2 = a (updated x of the previous loop iteration, i.e. the left neighbour in curr row) |
| 1130 | vld1.8 {d3},[r3]! @; load 8 bytes (pixel b) from prev |
| 1131 | @; row into d3, then advance the prev row pointer |
| 1132 | vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr |
| 1133 | @; row into d0; r2 only advances at the store below |
| 1134 | vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c (widened to 16-bit lanes) |
| 1135 | vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c) (16-bit lanes) |
| 1136 | vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c) (16-bit lanes) |
| 1137 | vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes, so no wrap-around) |
| 1138 | vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c) |
| 1139 | @ |
| 1140 | vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), as an all-ones / all-zeros lane mask |
| 1141 | vcle.s16 q6,q2,q4 @; q6 = (pa <= pc) |
| 1142 | vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc)) |
| 1143 | vcle.s16 q7,q3,q4 @; q7 = (pb <= pc) |
| 1144 | vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc)), mask narrowed to 8-bit lanes |
| 1145 | vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc), mask narrowed to 8-bit lanes |
| 1146 | @ |
| 1147 | vand d2,d2,d10 @; d2 = a where d10 is set, 0 elsewhere |
| 1148 | vbsl d14,d3,d1 @; d14 = b where (pb <= pc), c elsewhere |
| 1149 | vmvn d10,d10 @; invert d10: now set where a was NOT selected |
| 1150 | vand d14,d14,d10 @; d14 = b/c where a was not selected, 0 elsewhere |
| 1151 | vadd.i8 d2,d2,d14 @; d2 = p = a/b/c (the two operands are never both non-zero in a lane) |
| 1152 | vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x, 8-bit wrap-around add per byte) |
| 1153 | vmov d1,d3 @; d1 = b (c for next iteration) |
| 1154 | vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x) |
| 1155 | @; and advance the curr row pointer |
| 1156 | bne paeth_filter_8bpp_loop |
| 1157 | |
| 1158 | b paeth_filter_DONE @; reached once loop counter (r1) == rowbytes (r0) |
| 1159 | @; (this branch is redundant: the paeth_filter_DONE label follows immediately) |
| 1160 | paeth_filter_DONE: |
| 1161 | @; common exit of the PAETH paths: restore d8-d15 (= q4-q7), which the loops above use as scratch |
| 1162 | vpop {d8-d15} |
| 1163 | bx lr @; return to the caller (lr = r14) |
| 1164 | |
| 1165 | DONE: |
| 1166 | bx lr @; return without popping any NEON registers (branch sources are outside this section) |
| 1167 | |
| 1168 | |
| 1169 | .size png_read_filter_row_neon, .-png_read_filter_row_neon |
| 1170 | .END |