#; Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
#;
#; Redistribution and use in source and binary forms, with or without
#; modification, are permitted provided that the following conditions are
#; met:
#; * Redistributions of source code must retain the above copyright
#; notice, this list of conditions and the following disclaimer.
#; * Redistributions in binary form must reproduce the above
#; copyright notice, this list of conditions and the following
#; disclaimer in the documentation and/or other materials provided
#; with the distribution.
#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
#; contributors may be used to endorse or promote products derived
#; from this software without specific prior written permission.
#;
#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
#;==============================================================================

    .code 32                            @; Code is ARM ISA
#;==============================================================================

    .global png_read_filter_row_neon

#;==============================================================================
#; png_read_filter_row_neon
#;
#; Entry point.  Reads the filter type from the stack and branches to the
#; handler for that type.  Type 0 and any type above 4 branch to DONE.
#;
#; INPUTS:    r0         rowbytes:     number of bytes in current row
#;            r1         pixel_depth:  number of bits per pixel
#;            r2         row:          pointer to start of current row
#;            r3         prev_row:     pointer to start of previous row
#;            [sp,#0]    filter:       filter type
#;                                     1 = SUB, 2 = UP, 3 = AVG, 4 = PAETH
#;
#; SCRATCH:   r12 holds the filter type while dispatching.
#;
#; NOTE: Don't touch r5-r11
#;==============================================================================
.balign 32
.type png_read_filter_row_neon, %function
png_read_filter_row_neon:

    ldr     r12, [sp]                   @; r12 = filter type (stack argument)

    cmp     r12, #0                     @; type 0 has no handler
    beq     DONE

    cmp     r12, #1
    beq     sub_filter

    cmp     r12, #2
    beq     up_filter

    cmp     r12, #3
    beq     avg_filter

    cmp     r12, #4
    beq     paeth_filter

    b       DONE                        @; any other value: no handler either
66
67 #;; ---------------
68 #;; SUB filter type
69 #;; ---------------
70
71
sub_filter:

    @;; SUB filter set-up: save r4, turn pixel_depth into bytes per pixel and
    @;; branch to the handler specialised for that pixel size.  Pixel sizes
    @;; other than 1, 2, 3, 4, 6 and 8 bytes have no handler and leave the row
    @;; untouched.

    stmdb   sp!, {r4}                   @; r4 is the 16-byte block counter of the
                                        @; 1bpp/2bpp handlers; sub_filter_DONE
                                        @; restores it

    add     r1, r1, #7                  @; bpp = bytes per pixel
    lsr     r1, r1, #3                  @;     = (pixel_depth + 7) >> 3
    mov     r12, r1

    @;; r0  = rowbytes
    @;; r1  = loop counter = bpp (initially)
    @;; r2  = row pointer
    @;; r12 = bpp = loop/pointer increment value

    cmp     r1, r0
    bhs     sub_filter_exit             @; exit unless rowbytes > bpp.
                                        @; bpp == rowbytes: single pixel, nothing
                                        @; to add.  bpp > rowbytes (e.g. rowbytes
                                        @; == 0): the handlers count r1 up in
                                        @; steps of bpp until it EQUALS rowbytes,
                                        @; which could then never happen, so they
                                        @; must not be entered.

    cmp     r12, #1
    beq     sub_filter_1bpp

    cmp     r12, #2
    beq     sub_filter_2bpp

    cmp     r12, #3
    beq     sub_filter_3bpp

    cmp     r12, #4
    beq     sub_filter_4bpp

    cmp     r12, #6
    beq     sub_filter_6bpp

    cmp     r12, #8
    beq     sub_filter_8bpp

sub_filter_exit:
    b       sub_filter_DONE             @; return (restores r4)
108
109
sub_filter_1bpp:

    @;; ----------------------------
    @;; SUB filter, 1 byte per pixel
    @;; ----------------------------
    @;;
    @;; Each output byte is the (mod 256) sum of all input bytes up to and
    @;; including it.  Rows of 16 bytes or more are handled 16 bytes at a time
    @;; with a shift/add ladder; what is left over (or the whole row when it is
    @;; shorter than 16 bytes) goes through the byte loop at the end.
    @;;
    @;; In:  r0 = rowbytes, r1 = r12 = 1, r2 = row pointer
    @;; Uses r4, d0, d2, d16-d19, d21, d22

    lsrs    r4, r0, #4                  @; r4 = floor(rowbytes/16)
                                        @;    = iteration count for the 16-byte loop
    beq     sub_filter_1bpp_16bytes_done @; row shorter than 16 bytes

    mov     r1, #0                      @; r1 now counts bytes completed
    vmov.i8 d21, #0                     @; d21 = 0, zero fill for the vext below
    vld1.8  {d16,d17}, [r2]             @; load 16 pixels
                                        @; d16 = a b c d e f g h
                                        @; d17 = i j k l m n o p

sub_filter_1bpp_16bytes:

    @;; Low half.  The maximum data size for a shift is 64 bits, so each half
    @;; is summed on its own: shifting left by one byte moves every partial
    @;; sum up one pixel, and adding d16 extends it by one term.  Seven
    @;; shift/add steps give d18[i] = d16[0] + ... + d16[i].
    vshl.i64 d18, d16, #8               @; d18 = 0 a   b   c   d   e   f   g
    vadd.i8 d18, d16, d18               @; d18 = a a+b b+c c+d d+e e+f f+g g+h
    .rept   6                           @; steps 2..7
    vshl.i64 d18, d18, #8
    vadd.i8 d18, d16, d18
    .endr

    @;; Carry the last sum of the low half into the first byte of the high half.
    vext.8  d22, d18, d21, #7           @; d22 = d18[7] 0 0 0 0 0 0 0
                                        @;       d18[7] = a+b+c+d+e+f+g+h
    vadd.i8 d17, d17, d22               @; d17 = a+b+c+d+e+f+g+h+i j k l m n o p

    @;; High half: same seven-step ladder on d17.
    vshl.i64 d19, d17, #8
    vadd.i8 d19, d17, d19
    .rept   6
    vshl.i64 d19, d19, #8
    vadd.i8 d19, d17, d19
    .endr

    vst1.8  {d18,d19}, [r2]!            @; store the 16 results, row pointer += 16

    add     r1, r1, #16                 @; 16 more bytes completed
    subs    r4, r4, #1                  @; one 16-byte block fewer to go
    beq     sub_filter_1bpp_16bytes_adjust

    @;; More blocks to go: seed the next block with the last value just computed.
    vext.8  d22, d19, d21, #7           @; d22 = d19[7] 0 0 0 0 0 0 0
    vld1.8  {d16,d17}, [r2]             @; load the next 16 bytes
    vadd.i8 d16, d16, d22               @; add the previous pixel to the first byte
    b       sub_filter_1bpp_16bytes

sub_filter_1bpp_16bytes_adjust:

    @;; r2 points at the first unprocessed byte; the byte loop below wants it
    @;; to start on the previous (already updated) pixel.
    sub     r2, r2, #1
    cmp     r1, r0
    beq     sub_filter_DONE             @; no more pixels left .. exit

sub_filter_1bpp_16bytes_done:

    vld1.8  {d0[0]}, [r2]!              @; d0[0] = previous pixel,
                                        @; row pointer now on the current pixel
sub_filter_1bpp_loop:
    vld1.8  {d2[0]}, [r2]               @; d2[0] = current pixel
    vadd.i8 d0, d0, d2                  @; d0[0] = previous + current
    vst1.8  {d0[0]}, [r2]!              @; write it back, row pointer += 1

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     sub_filter_1bpp_loop        @; until loop counter == rowbytes

    b       sub_filter_DONE             @; return
220
221 #;; -----------------------------
222 #;; SUB filter, 2 bytes per pixel
223 #;; -----------------------------
sub_filter_2bpp:

    @;; Same scheme as the 1-byte handler, with pixels of 2 bytes: the ladder
    @;; shifts by 16 bits and needs three steps per 8-byte half (4 pixels).
    @;;
    @;; In:  r0 = rowbytes, r1 = r12 = 2, r2 = row pointer
    @;; Uses r4, d0, d2, d16-d19, d21, d22

    lsrs    r4, r0, #4                  @; r4 = floor(rowbytes/16)
                                        @;    = iteration count for the 16-byte loop
    beq     sub_filter_2bpp_16bytes_done @; row shorter than 16 bytes

    mov     r1, #0                      @; r1 now counts bytes completed
    vmov.i8 d21, #0                     @; d21 = 0, zero fill for the vext below
    vld1.8  {d16,d17}, [r2]             @; load 16 bytes to q8
                                        @; d16 = a b c d e f g h
                                        @; d17 = i j k l m n o p

sub_filter_2bpp_16bytes:

    @;; Low half: shift by one pixel (16 bits) and add, three times.
    vshl.i64 d18, d16, #16              @; previous pixel lined up with current
    vadd.i8 d18, d16, d18               @; byte-wise add to the current pixel
    .rept   2                           @; propagate the sums, as in the 1bpp case
    vshl.i64 d18, d18, #16
    vadd.i8 d18, d16, d18
    .endr

    @;; Carry the last pixel of the low half into the first pixel of the high half.
    vext.8  d22, d18, d21, #6           @; d22 = d18[6] d18[7] 0 0 0 0 0 0
    vadd.i8 d17, d17, d22

    @;; High half: same three-step ladder on d17.
    vshl.i64 d19, d17, #16
    vadd.i8 d19, d17, d19
    .rept   2
    vshl.i64 d19, d19, #16
    vadd.i8 d19, d17, d19
    .endr

    vst1.8  {d18,d19}, [r2]!            @; store the 16 results, row pointer += 16

    add     r1, r1, #16                 @; 16 more bytes completed
    subs    r4, r4, #1                  @; one 16-byte block fewer to go
    beq     sub_filter_2bpp_16bytes_adjust

    @;; More blocks to go: seed the next block with the last pixel just computed.
    vext.8  d22, d19, d21, #6           @; d22 = d19[6] d19[7] 0 0 0 0 0 0
    vld1.8  {d16,d17}, [r2]             @; load the next 16 bytes
    vadd.i8 d16, d16, d22               @; add the previous pixel to the first pixel

    b       sub_filter_2bpp_16bytes


sub_filter_2bpp_16bytes_adjust:

    @;; r2 points at the first unprocessed pixel; the pixel loop below wants it
    @;; to start on the previous (already updated) pixel.
    sub     r2, r2, #2
    cmp     r1, r0
    beq     sub_filter_DONE             @; no more pixels left .. exit

sub_filter_2bpp_16bytes_done:

    vld1.16 {d0[0]}, [r2]!              @; d0[0] = previous pixel (2 bytes),
                                        @; row pointer now on the current pixel
sub_filter_2bpp_loop:
    vld1.16 {d2[0]}, [r2]               @; d2[0] = current pixel (2 bytes)
    vadd.i8 d0, d0, d2                  @; byte-wise previous + current
    vst1.16 {d0[0]}, [r2]!              @; write it back, row pointer += 2

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     sub_filter_2bpp_loop        @; until loop counter == rowbytes

    b       sub_filter_DONE             @; return
302
303 #;; -----------------------------
304 #;; SUB filter, 3 bytes per pixel
305 #;; -----------------------------
sub_filter_3bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 3, r2 = row pointer
    @;; d0 carries the previous (already updated) pixel in bytes 0..2.
    @;;
    @;; NOTE(review): pixels are fetched with 4-byte loads, so every load reads
    @;; one byte beyond the pixel; on the last pixel that byte lies just past
    @;; rowbytes.  Presumably the caller's row buffer has room for it - confirm.

    vld1.32 {d0[0]}, [r2], r12          @; d0[0] = first pixel + 1 extra byte,
                                        @; row pointer += bpp
sub_filter_3bpp_loop:
    vld1.32 {d2[0]}, [r2]               @; d2[0] = current pixel + 1 extra byte
    vadd.i8 d0, d0, d2                  @; byte-wise previous + current
    vst1.16 {d0[0]}, [r2]!              @; write bytes 0-1, row pointer += 2
    vst1.8  {d0[2]}, [r2]!              @; write byte 2,    row pointer += 1

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     sub_filter_3bpp_loop        @; until loop counter == rowbytes

    b       sub_filter_DONE             @; return
326
327 #;; -----------------------------
328 #;; SUB filter, 4 bytes per pixel
329 #;; -----------------------------
sub_filter_4bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 4, r2 = row pointer
    @;; d0[0] carries the previous (already updated) pixel.

    vld1.32 {d0[0]}, [r2]!              @; d0[0] = first pixel, row pointer += 4

sub_filter_4bpp_loop:
    vld1.32 {d2[0]}, [r2]               @; d2[0] = current pixel
    vadd.i8 d0, d0, d2                  @; byte-wise previous + current
    vst1.32 {d0[0]}, [r2]!              @; write it back, row pointer += 4

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     sub_filter_4bpp_loop        @; until loop counter == rowbytes

    b       sub_filter_DONE             @; return
348
349 #;; -----------------------------
350 #;; SUB filter, 6 bytes per pixel
351 #;; -----------------------------
sub_filter_6bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 6, r2 = row pointer
    @;; d0 carries the previous (already updated) pixel in bytes 0..5.
    @;;
    @;; NOTE(review): pixels are fetched with 8-byte loads, so every load reads
    @;; two bytes beyond the pixel; on the last pixel those bytes lie just past
    @;; rowbytes.  Presumably the caller's row buffer has room for them - confirm.

    vld1.8  {d0}, [r2], r12             @; d0 = first pixel + 2 extra bytes,
                                        @; row pointer += bpp
sub_filter_6bpp_loop:
    vld1.8  {d2}, [r2]                  @; d2 = current pixel + 2 extra bytes
    vadd.i8 d0, d0, d2                  @; byte-wise previous + current
    vst1.32 {d0[0]}, [r2]!              @; write bytes 0-3, row pointer += 4
    vst1.16 {d0[2]}, [r2]!              @; write bytes 4-5, row pointer += 2

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     sub_filter_6bpp_loop        @; until loop counter == rowbytes

    b       sub_filter_DONE             @; return
372
373 #;; -----------------------------
374 #;; SUB filter, 8 bytes per pixel
375 #;; -----------------------------
sub_filter_8bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 8, r2 = row pointer
    @;; d0 carries the previous (already updated) pixel.

    vld1.8  {d0}, [r2]!                 @; d0 = first pixel, row pointer += 8

sub_filter_8bpp_loop:
    vld1.8  {d2}, [r2]                  @; d2 = current pixel
    vadd.i8 d0, d0, d2                  @; byte-wise previous + current
    vst1.8  {d0}, [r2]!                 @; write it back, row pointer += 8

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     sub_filter_8bpp_loop        @; until loop counter == rowbytes

    b       sub_filter_DONE             @; return
393
sub_filter_DONE:

    @;; Common exit of the SUB filter: undo the r4 save made at sub_filter
    @;; and return to the caller.
    ldmfd   sp!, {r4}
    bx      lr
398
399 #;; --------------
400 #;; UP filter type
401 #;; --------------
up_filter:

    @;; Adds the previous row to the current row, byte by byte (mod 256).
    @;; The row is consumed in chunks of 32, 16, 8, 4, 2 and finally 1 byte;
    @;; r0 always holds the number of bytes still to do.
    @;;
    @;; r0 = rowbytes
    @;; r1 = pixel_depth (not required for UP filter type), reused as
    @;;      chunk counter
    @;; r2 = row pointer
    @;; r3 = previous row pointer
    @;; Uses r12, q0-q3

    lsrs    r1, r0, #5                  @; r1 = floor(rowbytes/32)
                                        @;    = iteration count for the 32-byte loop
    beq     up_filter_32bytes_proc_done


up_filter_32bytes_proc:

    mov     r12, r2                     @; r12 reads the current row,
                                        @; r2 stays behind as the write pointer

    vld1.8  {q0}, [r3]!                 @; 32 bytes of the previous row,
    vld1.8  {q2}, [r3]!                 @; previous row pointer += 32

    vld1.8  {q1}, [r12]!                @; 32 bytes of the current row
    vld1.8  {q3}, [r12]!

    vadd.i8 q0, q0, q1                  @; byte-wise add, 2 x 16 bytes
    vadd.i8 q2, q2, q3

    vst1.8  {q0}, [r2]!                 @; 32 bytes back to the current row,
    vst1.8  {q2}, [r2]!                 @; row pointer += 32

    sub     r0, r0, #32                 @; 32 bytes fewer to do
    subs    r1, r1, #1                  @; decrement iteration count
    bne     up_filter_32bytes_proc



up_filter_32bytes_proc_done:

    lsrs    r1, r0, #4                  @; r1 = floor(remaining/16)
                                        @;    = iteration count for the 16-byte loop
    beq     up_filter_16bytes_proc_done

up_filter_16bytes_proc:

    vld1.8  {q0}, [r3]!                 @; 16 bytes of the previous row,
                                        @; previous row pointer += 16
    vld1.8  {q1}, [r2]                  @; 16 bytes of the current row
    vadd.i8 q0, q0, q1                  @; byte-wise add
    vst1.8  {q0}, [r2]!                 @; write back, row pointer += 16

    sub     r0, r0, #16                 @; 16 bytes fewer to do
    subs    r1, r1, #1                  @; decrement iteration count
    bne     up_filter_16bytes_proc

up_filter_16bytes_proc_done:

    lsrs    r1, r0, #3                  @; r1 = floor(remaining/8)
    beq     up_filter_8bytes_proc_done

up_filter_8bytes_proc:

    @;; Fewer than 16 bytes are left here, so this step runs at most once.
    vld1.8  {d0}, [r3]!                 @; 8 bytes of the previous row,
                                        @; previous row pointer += 8
    vld1.8  {d2}, [r2]                  @; 8 bytes of the current row
    vadd.i8 d0, d0, d2                  @; byte-wise add
    vst1.8  {d0}, [r2]!                 @; write back, row pointer += 8

    sub     r0, r0, #8                  @; 8 bytes fewer to do

up_filter_8bytes_proc_done:

    lsrs    r1, r0, #2                  @; r1 = floor(remaining/4)
    beq     up_filter_4bytes_proc_done

up_filter_4bytes_proc:

    vld1.32 {d0[0]}, [r3]!              @; 4 bytes of the previous row,
                                        @; previous row pointer += 4
    vld1.32 {d2[0]}, [r2]               @; 4 bytes of the current row
    vadd.i8 d0, d0, d2                  @; byte-wise add
    vst1.32 {d0[0]}, [r2]!              @; write back, row pointer += 4

    sub     r0, r0, #4                  @; 4 bytes fewer to do

up_filter_4bytes_proc_done:

    lsrs    r1, r0, #1                  @; r1 = floor(remaining/2)
    beq     up_filter_2bytes_proc_done

up_filter_2bytes_proc:

    vld1.16 {d0[0]}, [r3]!              @; 2 bytes of the previous row,
                                        @; previous row pointer += 2
    vld1.16 {d2[0]}, [r2]               @; 2 bytes of the current row
    vadd.i8 d0, d0, d2                  @; byte-wise add
    vst1.16 {d0[0]}, [r2]!              @; write back, row pointer += 2

    sub     r0, r0, #2                  @; 2 bytes fewer to do

up_filter_2bytes_proc_done:

    cmp     r0, #0                      @; anything left at all?
    beq     up_filter_1byte_proc_done

up_filter_1byte_proc:

    vld1.8  {d0[0]}, [r3]!              @; last byte of the previous row
    vld1.8  {d2[0]}, [r2]               @; last byte of the current row
    vadd.i8 d0, d0, d2                  @; add
    vst1.8  {d0[0]}, [r2]!              @; write back, row pointer += 1

up_filter_1byte_proc_done:

    b       DONE
522
523 #;; ---------------
524 #;; AVG filter type
525 #;; ---------------
avg_filter:

    @;; AVG filter set-up: turn pixel_depth into bytes per pixel and branch to
    @;; the handler specialised for that pixel size.  Pixel sizes other than
    @;; 1, 2, 3, 4, 6 and 8 bytes have no handler and leave the row untouched.

    add     r1, r1, #7                  @; bpp = bytes per pixel
    lsr     r1, r1, #3                  @;     = (pixel_depth + 7) >> 3
    mov     r12, r1

    @;; r0  = rowbytes
    @;; r1  = loop counter = bpp (initially)
    @;; r2  = row pointer
    @;; r3  = previous row pointer
    @;; r12 = bpp = loop/pointer increment value

    cmp     r1, r0
    bhi     avg_filter_exit             @; rowbytes < bpp (e.g. rowbytes == 0):
                                        @; every handler processes one whole
                                        @; pixel before looking at the counter
                                        @; and then loops until the counter
                                        @; EQUALS rowbytes, so a row shorter
                                        @; than one pixel must not reach them

    cmp     r12, #1
    beq     avg_filter_1bpp

    cmp     r12, #2
    beq     avg_filter_2bpp

    cmp     r12, #3
    beq     avg_filter_3bpp

    cmp     r12, #4
    beq     avg_filter_4bpp

    cmp     r12, #6
    beq     avg_filter_6bpp

    cmp     r12, #8
    beq     avg_filter_8bpp

avg_filter_exit:
    b       DONE                        @; return
558
559 #;; ----------------------------
560 #;; AVG filter, 1 byte per pixel
561 #;; ----------------------------
avg_filter_1bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 1, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it.  d0 carries a from one pixel to the next.

    @;; First pixel: there is no pixel to the left, so x += b/2.
    vld1.8  {d0[0]}, [r2]               @; d0[0] = x
    vld1.8  {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 1
    vsra.u8 d0, d1, #1                  @; d0 = x + (b >> 1)
    vst1.8  {d0[0]}, [r2]!              @; store, row pointer += 1
                                        @; updated pixel x is now pixel a
    cmp     r1, r0
    beq     DONE                        @; the row was a single pixel

avg_filter_1bpp_loop:
    vld1.8  {d2[0]}, [r2]               @; d2[0] = x
    vld1.8  {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 1
    vaddl.u8 q2, d0, d1                 @; q2 = a + b, widened to 16 bits
    vshrn.i16 d1, q2, #1                @; d1 = (a + b)/2, back to 8 bits
    vadd.i8 d0, d2, d1                  @; d0 = x + ((a + b)/2)
    vst1.8  {d0[0]}, [r2]!              @; store, row pointer += 1

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     avg_filter_1bpp_loop        @; until loop counter == rowbytes

    b       DONE
596 #;; -----------------------------
597 #;; AVG filter, 2 bytes per pixel
598 #;; -----------------------------
avg_filter_2bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 2, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it.  d0 carries a from one pixel to the next.

    @;; First pixel: there is no pixel to the left, so x += b/2 per byte.
    vld1.16 {d0[0]}, [r2]               @; d0[0] = x (2 bytes)
    vld1.16 {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 2
    vsra.u8 d0, d1, #1                  @; d0 = x + (b >> 1)
    vst1.16 {d0[0]}, [r2]!              @; store, row pointer += 2
                                        @; updated pixel x is now pixel a
    cmp     r1, r0
    beq     DONE                        @; the row was a single pixel

avg_filter_2bpp_loop:
    vld1.16 {d2[0]}, [r2]               @; d2[0] = x
    vld1.16 {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 2
    vaddl.u8 q2, d0, d1                 @; q2 = a + b, widened to 16 bits
    vshrn.i16 d1, q2, #1                @; d1 = (a + b)/2, back to 8 bits
    vadd.i8 d0, d2, d1                  @; d0 = x + ((a + b)/2)
    vst1.16 {d0[0]}, [r2]!              @; store, row pointer += 2

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     avg_filter_2bpp_loop        @; until loop counter == rowbytes

    b       DONE
634
635 #;; -----------------------------
636 #;; AVG filter, 3 bytes per pixel
637 #;; -----------------------------
avg_filter_3bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 3, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it.  d0 carries a (bytes 0..2) to the next pixel.
    @;;
    @;; NOTE(review): both rows are read with 4-byte loads, one byte more than
    @;; a pixel; on the last pixel that byte lies just past rowbytes in each
    @;; row.  Presumably both buffers have room for it - confirm.

    @;; First pixel: there is no pixel to the left, so x += b/2 per byte.
    vld1.32 {d0[0]}, [r2]               @; d0[0] = x + 1 extra byte
    vld1.32 {d1[0]}, [r3], r12          @; d1[0] = b + 1 extra byte,
                                        @; previous row pointer += bpp
    vsra.u8 d0, d1, #1                  @; d0 = x + (b >> 1)
    vst1.16 {d0[0]}, [r2]!              @; store bytes 0-1, row pointer += 2
    vst1.8  {d0[2]}, [r2]!              @; store byte 2,    row pointer += 1
                                        @; updated pixel x is now pixel a
    cmp     r1, r0
    beq     DONE                        @; the row was a single pixel

avg_filter_3bpp_loop:
    vld1.32 {d2[0]}, [r2]               @; d2[0] = x + 1 extra byte
    vld1.32 {d1[0]}, [r3], r12          @; d1[0] = b + 1 extra byte,
                                        @; previous row pointer += bpp
    vaddl.u8 q2, d0, d1                 @; q2 = a + b, widened to 16 bits
    vshrn.i16 d1, q2, #1                @; d1 = (a + b)/2, back to 8 bits
    vadd.i8 d0, d2, d1                  @; d0 = x + ((a + b)/2)
    vst1.16 {d0[0]}, [r2]!              @; store bytes 0-1, row pointer += 2
    vst1.8  {d0[2]}, [r2]!              @; store byte 2,    row pointer += 1

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     avg_filter_3bpp_loop        @; until loop counter == rowbytes

    b       DONE
676 #;; -----------------------------
677 #;; AVG filter, 4 bytes per pixel
678 #;; -----------------------------
avg_filter_4bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 4, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it.  d0 carries a from one pixel to the next.

    @;; First pixel: there is no pixel to the left, so x += b/2 per byte.
    vld1.32 {d0[0]}, [r2]               @; d0[0] = x (4 bytes)
    vld1.32 {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 4
    vsra.u8 d0, d1, #1                  @; d0 = x + (b >> 1)
    vst1.32 {d0[0]}, [r2]!              @; store, row pointer += 4
                                        @; updated pixel x is now pixel a
    cmp     r1, r0
    beq     DONE                        @; the row was a single pixel

avg_filter_4bpp_loop:
    vld1.32 {d2[0]}, [r2]               @; d2[0] = x
    vld1.32 {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 4
    vaddl.u8 q2, d0, d1                 @; q2 = a + b, widened to 16 bits
    vshrn.i16 d1, q2, #1                @; d1 = (a + b)/2, back to 8 bits
    vadd.i8 d0, d2, d1                  @; d0 = x + ((a + b)/2)
    vst1.32 {d0[0]}, [r2]!              @; store, row pointer += 4

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     avg_filter_4bpp_loop        @; until loop counter == rowbytes

    b       DONE
713 #;; -----------------------------
714 #;; AVG filter, 6 bytes per pixel
715 #;; -----------------------------
avg_filter_6bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 6, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it.  d0 carries a (bytes 0..5) to the next pixel.
    @;;
    @;; NOTE(review): both rows are read with 8-byte loads, two bytes more than
    @;; a pixel; on the last pixel those bytes lie just past rowbytes in each
    @;; row.  Presumably both buffers have room for them - confirm.

    @;; First pixel: there is no pixel to the left, so x += b/2 per byte.
    vld1.8  {d0}, [r2]                  @; d0 = x + 2 extra bytes
    vld1.8  {d1}, [r3], r12             @; d1 = b + 2 extra bytes,
                                        @; previous row pointer += bpp
    vsra.u8 d0, d1, #1                  @; d0 = x + (b >> 1)
    vst1.32 {d0[0]}, [r2]!              @; store bytes 0-3, row pointer += 4
    vst1.16 {d0[2]}, [r2]!              @; store bytes 4-5, row pointer += 2
                                        @; updated pixel x is now pixel a
    cmp     r1, r0
    beq     DONE                        @; the row was a single pixel

avg_filter_6bpp_loop:
    vld1.8  {d2}, [r2]                  @; d2 = x + 2 extra bytes
    vld1.8  {d1}, [r3], r12             @; d1 = b + 2 extra bytes,
                                        @; previous row pointer += bpp
    vaddl.u8 q2, d0, d1                 @; q2 = a + b, widened to 16 bits
    vshrn.i16 d1, q2, #1                @; d1 = (a + b)/2, back to 8 bits
    vadd.i8 d0, d2, d1                  @; d0 = x + ((a + b)/2)
    vst1.32 {d0[0]}, [r2]!              @; store bytes 0-3, row pointer += 4
    vst1.16 {d0[2]}, [r2]!              @; store bytes 4-5, row pointer += 2

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     avg_filter_6bpp_loop        @; until loop counter == rowbytes

    b       DONE
755 #;; -----------------------------
756 #;; AVG filter, 8 bytes per pixel
757 #;; -----------------------------
avg_filter_8bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 8, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it.  d0 carries a from one pixel to the next.

    @;; First pixel: there is no pixel to the left, so x += b/2 per byte.
    vld1.8  {d0}, [r2]                  @; d0 = x (8 bytes)
    vld1.8  {d1}, [r3]!                 @; d1 = b, previous row pointer += 8
    vsra.u8 d0, d1, #1                  @; d0 = x + (b >> 1)
    vst1.8  {d0}, [r2]!                 @; store, row pointer += 8
                                        @; updated pixel x is now pixel a
    cmp     r1, r0
    beq     DONE                        @; the row was a single pixel

avg_filter_8bpp_loop:
    vld1.8  {d2}, [r2]                  @; d2 = x
    vld1.8  {d1}, [r3]!                 @; d1 = b, previous row pointer += 8
    vaddl.u8 q2, d0, d1                 @; q2 = a + b, widened to 16 bits
    vshrn.i16 d1, q2, #1                @; d1 = (a + b)/2, back to 8 bits
    vadd.i8 d0, d2, d1                  @; d0 = x + ((a + b)/2)
    vst1.8  {d0}, [r2]!                 @; store, row pointer += 8

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     avg_filter_8bpp_loop        @; until loop counter == rowbytes

    b       DONE
791 #;; -----------------
792 #;; PAETH filter type
793 #;; -----------------
paeth_filter:

    @;; PAETH filter set-up: save q4-q7 (the per-pixel handlers use them as
    @;; work registers), turn pixel_depth into bytes per pixel and branch to
    @;; the handler specialised for that pixel size.  Pixel sizes other than
    @;; 1, 2, 3, 4, 6 and 8 bytes have no handler and leave the row untouched.
    @;; Every way out goes through paeth_filter_DONE.

    VPUSH   {q4-q7}
    add     r1, r1, #7                  @; bpp = bytes per pixel
    lsr     r1, r1, #3                  @;     = (pixel_depth + 7) >> 3
    mov     r12, r1

    @;; r0  = rowbytes
    @;; r1  = loop counter = bpp (initially)
    @;; r2  = row pointer
    @;; r3  = previous row pointer
    @;; r12 = bpp = loop/pointer increment value

    cmp     r1, r0
    bhi     paeth_filter_exit           @; rowbytes < bpp (e.g. rowbytes == 0):
                                        @; every handler processes one whole
                                        @; pixel before looking at the counter
                                        @; and then loops until the counter
                                        @; EQUALS rowbytes, so a row shorter
                                        @; than one pixel must not reach them

    cmp     r12, #1
    beq     paeth_filter_1bpp

    cmp     r12, #2
    beq     paeth_filter_2bpp

    cmp     r12, #3
    beq     paeth_filter_3bpp

    cmp     r12, #4
    beq     paeth_filter_4bpp

    cmp     r12, #6
    beq     paeth_filter_6bpp

    cmp     r12, #8
    beq     paeth_filter_8bpp

paeth_filter_exit:
    b       paeth_filter_DONE           @; return
828
829 #;; ------------------------------
830 #;; PAETH filter, 1 byte per pixel
831 #;; ------------------------------
paeth_filter_1bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 1, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it, c = pixel above a.
    @;; Uses d0-d3, q2-q7.

    @;; First pixel: only b is added.
    vld1.8  {d0[0]}, [r2]               @; d0[0] = x
    vld1.8  {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 1
    vadd.i8 d2, d0, d1                  @; d2 = x + b = updated pixel x
    vst1.8  {d2[0]}, [r2]!              @; store, row pointer += 1

    cmp     r1, r0
    beq     paeth_filter_DONE           @; the row was a single pixel

paeth_filter_1bpp_loop:

    @;; d1[0] = c (b in the previous loop iteration)
    @;; d2[0] = a (x in the previous loop iteration)
    vld1.8  {d3[0]}, [r3]!              @; d3[0] = b, previous row pointer += 1
    vld1.8  {d0[0]}, [r2]               @; d0[0] = x

    @;; Distances, widened to 16 bits.
    vshll.u8 q4, d1, #1                 @; q4 = c<<1 = 2c
    vabdl.u8 q3, d2, d1                 @; q3 = pb = abs(a - c)
    vabdl.u8 q2, d3, d1                 @; q2 = pa = abs(b - c)
    vaddl.u8 q5, d2, d3                 @; q5 = a + b
    vabd.u16 q4, q5, q4                 @; q4 = pc = abs(a + b - 2c)

    @;; Selection masks: a if pa is smallest, else b if pb <= pc, else c.
    vcle.s16 q5, q2, q3                 @; q5 = (pa <= pb)
    vcle.s16 q6, q2, q4                 @; q6 = (pa <= pc)
    vand    q5, q5, q6                  @; q5 = ((pa <= pb) && (pa <= pc))
    vcle.s16 q7, q3, q4                 @; q7 = (pb <= pc)
    vshrn.u16 d10, q5, #8               @; d10 = "take a" mask, one byte per lane
    vshrn.u16 d14, q7, #8               @; d14 = "b rather than c" mask

    vand    d2, d2, d10                 @; d2  = a where 1, 0 where 0
    vbsl    d14, d3, d1                 @; d14 = b where 1, c where 0
    vmvn    d10, d10                    @; invert d10
    vand    d14, d14, d10               @; d14 = b/c where 1, 0 where 0
    vadd.i8 d2, d2, d14                 @; d2  = p = a/b/c where appropriate
    vadd.i8 d2, d2, d0                  @; d2  = x + p (updated pixel x)
    vmov    d1, d3                      @; d1  = b (c for next iteration)
    vst1.8  {d2[0]}, [r2]!              @; store, row pointer += 1

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     paeth_filter_1bpp_loop      @; until loop counter == rowbytes

    b       paeth_filter_DONE
885 #;; -------------------------------
886 #;; PAETH filter, 2 bytes per pixel
887 #;; -------------------------------
paeth_filter_2bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 2, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it, c = pixel above a.  All arithmetic is per byte.
    @;; Uses d0-d3, q2-q7.

    @;; First pixel: only b is added.
    vld1.16 {d0[0]}, [r2]               @; d0[0] = x (2 bytes)
    vld1.16 {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 2
    vadd.i8 d2, d0, d1                  @; d2 = x + b = updated pixel x
    vst1.16 {d2[0]}, [r2]!              @; store, row pointer += 2

    cmp     r1, r0
    beq     paeth_filter_DONE           @; the row was a single pixel

paeth_filter_2bpp_loop:

    @;; d1[0] = c (b in the previous loop iteration)
    @;; d2[0] = a (x in the previous loop iteration)
    vld1.16 {d3[0]}, [r3]!              @; d3[0] = b, previous row pointer += 2
    vld1.16 {d0[0]}, [r2]               @; d0[0] = x

    @;; Distances, widened to 16 bits.
    vshll.u8 q4, d1, #1                 @; q4 = c<<1 = 2c
    vabdl.u8 q3, d2, d1                 @; q3 = pb = abs(a - c)
    vabdl.u8 q2, d3, d1                 @; q2 = pa = abs(b - c)
    vaddl.u8 q5, d2, d3                 @; q5 = a + b
    vabd.u16 q4, q5, q4                 @; q4 = pc = abs(a + b - 2c)

    @;; Selection masks: a if pa is smallest, else b if pb <= pc, else c.
    vcle.s16 q5, q2, q3                 @; q5 = (pa <= pb)
    vcle.s16 q6, q2, q4                 @; q6 = (pa <= pc)
    vand    q5, q5, q6                  @; q5 = ((pa <= pb) && (pa <= pc))
    vcle.s16 q7, q3, q4                 @; q7 = (pb <= pc)
    vshrn.u16 d10, q5, #8               @; d10 = "take a" mask, one byte per lane
    vshrn.u16 d14, q7, #8               @; d14 = "b rather than c" mask

    vand    d2, d2, d10                 @; d2  = a where 1, 0 where 0
    vbsl    d14, d3, d1                 @; d14 = b where 1, c where 0
    vmvn    d10, d10                    @; invert d10
    vand    d14, d14, d10               @; d14 = b/c where 1, 0 where 0
    vadd.i8 d2, d2, d14                 @; d2  = p = a/b/c where appropriate
    vadd.i8 d2, d2, d0                  @; d2  = x + p (updated pixel x)
    vmov    d1, d3                      @; d1  = b (c for next iteration)
    vst1.16 {d2[0]}, [r2]!              @; store, row pointer += 2

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     paeth_filter_2bpp_loop      @; until loop counter == rowbytes

    b       paeth_filter_DONE
938 #;; -------------------------------
939 #;; PAETH filter, 3 bytes per pixel
940 #;; -------------------------------
paeth_filter_3bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 3, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it, c = pixel above a.  All arithmetic is per byte.
    @;; Uses d0-d3, q2-q7.
    @;;
    @;; NOTE(review): both rows are read with 4-byte loads, one byte more than
    @;; a pixel; on the last pixel that byte lies just past rowbytes in each
    @;; row.  Presumably both buffers have room for it - confirm.

    @;; First pixel: only b is added.
    vld1.32 {d0[0]}, [r2]               @; d0[0] = x + 1 extra byte
    vld1.32 {d1[0]}, [r3], r12          @; d1[0] = b + 1 extra byte,
                                        @; previous row pointer += bpp
    vadd.i8 d2, d0, d1                  @; d2 = x + b = updated pixel x
    vst1.16 {d2[0]}, [r2]!              @; store bytes 0-1, row pointer += 2
    vst1.8  {d2[2]}, [r2]!              @; store byte 2,    row pointer += 1

    cmp     r1, r0
    beq     paeth_filter_DONE           @; the row was a single pixel

paeth_filter_3bpp_loop:

    @;; d1[0] = c (b in the previous loop iteration)
    @;; d2[0] = a (x in the previous loop iteration)
    vld1.32 {d3[0]}, [r3], r12          @; d3[0] = b + 1 extra byte,
                                        @; previous row pointer += bpp
    vld1.32 {d0[0]}, [r2]               @; d0[0] = x + 1 extra byte

    @;; Distances, widened to 16 bits.
    vshll.u8 q4, d1, #1                 @; q4 = c<<1 = 2c
    vabdl.u8 q3, d2, d1                 @; q3 = pb = abs(a - c)
    vabdl.u8 q2, d3, d1                 @; q2 = pa = abs(b - c)
    vaddl.u8 q5, d2, d3                 @; q5 = a + b
    vabd.u16 q4, q5, q4                 @; q4 = pc = abs(a + b - 2c)

    @;; Selection masks: a if pa is smallest, else b if pb <= pc, else c.
    vcle.s16 q5, q2, q3                 @; q5 = (pa <= pb)
    vcle.s16 q6, q2, q4                 @; q6 = (pa <= pc)
    vand    q5, q5, q6                  @; q5 = ((pa <= pb) && (pa <= pc))
    vcle.s16 q7, q3, q4                 @; q7 = (pb <= pc)
    vshrn.u16 d10, q5, #8               @; d10 = "take a" mask, one byte per lane
    vshrn.u16 d14, q7, #8               @; d14 = "b rather than c" mask

    vand    d2, d2, d10                 @; d2  = a where 1, 0 where 0
    vbsl    d14, d3, d1                 @; d14 = b where 1, c where 0
    vmvn    d10, d10                    @; invert d10
    vand    d14, d14, d10               @; d14 = b/c where 1, 0 where 0
    vadd.i8 d2, d2, d14                 @; d2  = p = a/b/c where appropriate
    vadd.i8 d2, d2, d0                  @; d2  = x + p (updated pixel x)
    vmov    d1, d3                      @; d1  = b (c for next iteration)
    vst1.16 {d2[0]}, [r2]!              @; store bytes 0-1, row pointer += 2
    vst1.8  {d2[2]}, [r2]!              @; store byte 2,    row pointer += 1

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     paeth_filter_3bpp_loop      @; until loop counter == rowbytes

    b       paeth_filter_DONE
996 #;; -------------------------------
997 #;; PAETH filter, 4 bytes per pixel
998 #;; -------------------------------
paeth_filter_4bpp:

    @;; In:  r0 = rowbytes, r1 = r12 = 4, r2 = row, r3 = previous row
    @;; x = pixel being rebuilt, a = pixel to its left (already rebuilt),
    @;; b = pixel above it, c = pixel above a.  All arithmetic is per byte.
    @;; Uses d0-d3, q2-q7.

    @;; First pixel: only b is added.
    vld1.32 {d0[0]}, [r2]               @; d0[0] = x (4 bytes)
    vld1.32 {d1[0]}, [r3]!              @; d1[0] = b, previous row pointer += 4
    vadd.i8 d2, d0, d1                  @; d2 = x + b = updated pixel x
    vst1.32 {d2[0]}, [r2]!              @; store, row pointer += 4

    cmp     r1, r0
    beq     paeth_filter_DONE           @; the row was a single pixel

paeth_filter_4bpp_loop:

    @;; d1[0] = c (b in the previous loop iteration)
    @;; d2[0] = a (x in the previous loop iteration)
    vld1.32 {d3[0]}, [r3]!              @; d3[0] = b, previous row pointer += 4
    vld1.32 {d0[0]}, [r2]               @; d0[0] = x

    @;; Distances, widened to 16 bits.
    vshll.u8 q4, d1, #1                 @; q4 = c<<1 = 2c
    vabdl.u8 q3, d2, d1                 @; q3 = pb = abs(a - c)
    vabdl.u8 q2, d3, d1                 @; q2 = pa = abs(b - c)
    vaddl.u8 q5, d2, d3                 @; q5 = a + b
    vabd.u16 q4, q5, q4                 @; q4 = pc = abs(a + b - 2c)

    @;; Selection masks: a if pa is smallest, else b if pb <= pc, else c.
    vcle.s16 q5, q2, q3                 @; q5 = (pa <= pb)
    vcle.s16 q6, q2, q4                 @; q6 = (pa <= pc)
    vand    q5, q5, q6                  @; q5 = ((pa <= pb) && (pa <= pc))
    vcle.s16 q7, q3, q4                 @; q7 = (pb <= pc)
    vshrn.u16 d10, q5, #8               @; d10 = "take a" mask, one byte per lane
    vshrn.u16 d14, q7, #8               @; d14 = "b rather than c" mask

    vand    d2, d2, d10                 @; d2  = a where 1, 0 where 0
    vbsl    d14, d3, d1                 @; d14 = b where 1, c where 0
    vmvn    d10, d10                    @; invert d10
    vand    d14, d14, d10               @; d14 = b/c where 1, 0 where 0
    vadd.i8 d2, d2, d14                 @; d2  = p = a/b/c where appropriate
    vadd.i8 d2, d2, d0                  @; d2  = x + p (updated pixel x)
    vmov    d1, d3                      @; d1  = b (c for next iteration)
    vst1.32 {d2[0]}, [r2]!              @; store, row pointer += 4

    add     r1, r1, r12                 @; loop counter += bpp
    cmp     r1, r0
    bne     paeth_filter_4bpp_loop      @; until loop counter == rowbytes

    b       paeth_filter_DONE
1050 #;; -------------------------------
1051 #;; PAETH filter, 6 bytes per pixel
1052 #;; -------------------------------
 @; Reverses the PNG Paeth filter in place on the current row, one 6-byte
 @; pixel per pass. Register roles, as described by the comments in this
 @; section: r0 = rowbytes, r1 = loop counter, r2 = curr row pointer,
 @; r3 = prev row pointer, r12 = bpp.
 @; Each pixel is loaded as a full 8-byte d register and stored as a
 @; 4-byte lane followed by a 2-byte lane, so the top 2 bytes of every d
 @; register are computed but never written back.
 @; NOTE(review): the 8-byte loads read 2 bytes beyond the 6-byte pixel.
 @; For the last pixel of a row that is 2 bytes past rowbytes in both the
 @; curr and prev rows -- confirm both buffers are padded by at least 2
 @; bytes.
1053paeth_filter_6bpp:
1054 cmp r1, r0 @; flags are tested by the beq below; only NEON loads/adds/stores sit in between
1055
1056 vld1.8 {d0},[r2] @; d0 = x: 6 filtered bytes from the curr
1057 @; row plus the 2 bytes that follow them
1058 vld1.8 {d1},[r3],r12 @; d1 = b: 6 bytes from the prev row
1059 @; plus the 2 bytes that follow them;
1060 @; prev row pointer += r12 (bpp)
1061 vadd.i8 d2,d0,d1 @; first pixel: d2 = x + b bytewise; no a or c term is applied
1062 vst1.32 {d2[0]},[r2]! @; write reconstructed bytes 0-3,
1063 @; curr row pointer += 4
1064 vst1.16 {d2[2]},[r2]! @; write reconstructed bytes 4-5,
1065 @; curr row pointer += 2 (6 in total)
1066 beq paeth_filter_DONE @; counter already == rowbytes: nothing left to do
1067
1068paeth_filter_6bpp_loop:
1069 add r1,r1,r12 @; loop counter += bpp
1070 cmp r1,r0 @; sets the flags tested by the bne at the bottom of the loop
1071
1072
1073 #;; d1 = c (the b loaded on the previous pass)
1074 #;; d2 = a (the pixel reconstructed on the previous pass)
1075 vld1.8 {d3},[r3],r12 @; d3 = b: 6 bytes from the prev row plus
1076 @; 2 trailing bytes; prev pointer += r12
1077 vld1.8 {d0},[r2] @; d0 = x: 6 filtered bytes from the curr
1078 @; row plus 2 trailing bytes
1079 vshll.u8 q4,d1,#1 @; q4 = 2c, widened to 16-bit lanes
1080 vabdl.u8 q3,d2,d1 @; q3 = pb = |a - c| (16-bit lanes)
1081 vabdl.u8 q2,d3,d1 @; q2 = pa = |b - c| (16-bit lanes)
1082 vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes)
1083 vabd.u16 q4,q5,q4 @; q4 = pc = |a + b - 2c|
1084
 @; signed 16-bit compares are safe below: pa, pb <= 255 and pc <= 510
1085 vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), all-ones in each true lane
1086 vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1087 vand q5,q5,q6 @; q5 = (pa <= pb) && (pa <= pc)
1088 vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1089 vshrn.u16 d10,q5,#8 @; narrow 16-bit masks to byte masks: d10 = "a is chosen"
1090 vshrn.u16 d14,q7,#8 @; d14 = byte mask for (pb <= pc)
1091
 @; select the predictor per byte: a first, then b, otherwise c
1092 vand d2,d2,d10 @; d2 = a where a is chosen, else 0
1093 vbsl d14,d3,d1 @; d14 = b where (pb <= pc), else c
1094 vmvn d10,d10 @; d10 = bytes where a is NOT chosen
1095 vand d14,d14,d10 @; d14 = b or c where a is not chosen, else 0
1096 vadd.i8 d2,d2,d14 @; d2 = predictor p (the two operands are never both non-zero in a byte)
1097 vadd.i8 d2,d2,d0 @; d2 = x + p: reconstructed pixel, and a for the next pass
1098 vmov d1,d3 @; d1 = b, which becomes c for the next pass
1099 vst1.32 {d2[0]},[r2]! @; write reconstructed bytes 0-3,
1100 @; curr row pointer += 4
1101 vst1.16 {d2[2]},[r2]! @; write reconstructed bytes 4-5,
1102 @; curr row pointer += 2 (6 in total)
1103 bne paeth_filter_6bpp_loop @; repeat until loop counter == rowbytes
1104
1105 b paeth_filter_DONE @; row complete: jump to the shared
1106 @; exit at paeth_filter_DONE
1107 #;; -------------------------------
1108 #;; PAETH filter, 8 bytes per pixel
1109 #;; -------------------------------
 @; Reverses the PNG Paeth filter in place on the current row, one 8-byte
 @; pixel per pass. Register roles, as described by the comments in this
 @; section: r0 = rowbytes, r1 = loop counter, r2 = curr row pointer,
 @; r3 = prev row pointer, r12 = bpp.
 @; A pixel fills a d register exactly, so every loaded and computed byte
 @; is also stored.
1110paeth_filter_8bpp:
1111 cmp r1, r0 @; flags are tested by the beq below; only NEON loads/adds/stores sit in between
1112
1113 vld1.8 {d0},[r2] @; d0 = x: 8 filtered bytes from the curr
1114 @; row (pointer not advanced yet)
1115 vld1.8 {d1},[r3]! @; d1 = b: 8 bytes from the prev row;
1116 @; the post-increment moves the prev
1117 @; row pointer past this pixel
1118 vadd.i8 d2,d0,d1 @; first pixel: d2 = x + b bytewise; no a or c term is applied
1119 vst1.8 {d2},[r2]! @; write the 8 reconstructed bytes back
1120 @; and advance the curr row pointer
1121 beq paeth_filter_DONE @; counter already == rowbytes: nothing left to do
1122
1123paeth_filter_8bpp_loop:
1124 add r1,r1,r12 @; loop counter += bpp
1125 cmp r1,r0 @; sets the flags tested by the bne at the bottom of the loop
1126
1127
1128 #;; d1 = c (the b loaded on the previous pass)
1129 #;; d2 = a (the pixel reconstructed on the previous pass)
1130 vld1.8 {d3},[r3]! @; d3 = b: 8 bytes from the prev row;
1131 @; prev row pointer advances
1132 vld1.8 {d0},[r2] @; d0 = x: 8 filtered bytes from the
1133 @; curr row
1134 vshll.u8 q4,d1,#1 @; q4 = 2c, widened to 16-bit lanes
1135 vabdl.u8 q3,d2,d1 @; q3 = pb = |a - c| (16-bit lanes)
1136 vabdl.u8 q2,d3,d1 @; q2 = pa = |b - c| (16-bit lanes)
1137 vaddl.u8 q5,d2,d3 @; q5 = a + b (16-bit lanes)
1138 vabd.u16 q4,q5,q4 @; q4 = pc = |a + b - 2c|
1139 @; signed 16-bit compares are safe below: pa, pb <= 255 and pc <= 510
1140 vcle.s16 q5,q2,q3 @; q5 = (pa <= pb), all-ones in each true lane
1141 vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
1142 vand q5,q5,q6 @; q5 = (pa <= pb) && (pa <= pc)
1143 vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
1144 vshrn.u16 d10,q5,#8 @; narrow 16-bit masks to byte masks: d10 = "a is chosen"
1145 vshrn.u16 d14,q7,#8 @; d14 = byte mask for (pb <= pc)
1146 @; select the predictor per byte: a first, then b, otherwise c
1147 vand d2,d2,d10 @; d2 = a where a is chosen, else 0
1148 vbsl d14,d3,d1 @; d14 = b where (pb <= pc), else c
1149 vmvn d10,d10 @; d10 = bytes where a is NOT chosen
1150 vand d14,d14,d10 @; d14 = b or c where a is not chosen, else 0
1151 vadd.i8 d2,d2,d14 @; d2 = predictor p (the two operands are never both non-zero in a byte)
1152 vadd.i8 d2,d2,d0 @; d2 = x + p: reconstructed pixel, and a for the next pass
1153 vmov d1,d3 @; d1 = b, which becomes c for the next pass
1154 vst1.8 {d2},[r2]! @; write the 8 reconstructed bytes back
1155 @; and advance the curr row pointer
1156 bne paeth_filter_8bpp_loop @; repeat until loop counter == rowbytes
1157
1158 b paeth_filter_DONE @; row complete; the target label is the
1159 @; next one in the file, so this equals falling through
 @; Shared exit for the Paeth sections: restore the NEON registers and
 @; return to the caller.
1160paeth_filter_DONE:
1161
1162 VPOP {q4-q7} @; restore q4-q7 (d8-d15), which the Paeth loops use as scratch. NOTE(review): the matching VPUSH is outside this section -- confirm
1163 bx r14 @; return through the link register
1164
 @; Plain return that does not pop q4-q7.
 @; NOTE(review): presumably reached only from paths that never pushed
 @; q4-q7 -- confirm against the branches that target this label.
1165DONE:
1166 bx r14 @; return through the link register
1167
1168
1169.size png_read_filter_row_neon, .-png_read_filter_row_neon
1170 .END