blob: 1a45745aed005d8d58f9f9479b26a184c7558c1c [file] [log] [blame]
#; Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
#;
#; Redistribution and use in source and binary forms, with or without
#; modification, are permitted provided that the following conditions are
#; met:
#; * Redistributions of source code must retain the above copyright
#; notice, this list of conditions and the following disclaimer.
#; * Redistributions in binary form must reproduce the above
#; copyright notice, this list of conditions and the following
#; disclaimer in the documentation and/or other materials provided
#; with the distribution.
#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
#; contributors may be used to endorse or promote products derived
#; from this software without specific prior written permission.
#;
#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#;==============================================================================
.code 32 @; Code is ARM ISA
#;==============================================================================
.global png_read_filter_row_neon
#;==============================================================================
#; INPUTS: r0 rowbytes: number of bytes in current row
#; r1 pixel_depth: number of bits per pixel
#; r2 row: pointer to start of current row
#; r3 prev_row: pointer to start of previous row
#; [sp,#0] filter: filter type
#;
#; NOTE: Don't touch r5-r11
#;==============================================================================
.balign 32
.type png_read_filter_row_neon, %function
png_read_filter_row_neon:
ldr r12,[sp,#0]
cmp r12,#0
beq DONE
cmp r12,#1
beq sub_filter
cmp r12,#2
beq up_filter
cmp r12,#3
beq avg_filter
cmp r12,#4
beq paeth_filter
b DONE
#;; ---------------
#;; SUB filter type
#;; ---------------
sub_filter:
stmdb sp!, {r4}
add r1,r1,#7 @; bpp = bytes per pixel
lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
mov r12,r1
#;; r0 = rowbytes
#;; r1 = loop counter = bpp (initially)
#;; r2 = row pointer
#;; r12 = bpp = loop/pointer increment value
cmp r1,r0
beq sub_filter_exit @; exit if bpp == rowbytes
cmp r12,#1
beq sub_filter_1bpp
cmp r12,#2
beq sub_filter_2bpp
cmp r12,#3
beq sub_filter_3bpp
cmp r12,#4
beq sub_filter_4bpp
cmp r12,#6
beq sub_filter_6bpp
cmp r12,#8
beq sub_filter_8bpp
sub_filter_exit:
b sub_filter_DONE @; return
sub_filter_1bpp:
#;; ----------------------------
#;; SUB filter, 1 byte per pixel
#;; ----------------------------
lsrs r4,r0,#4 @; r1 = floor(rowbytes/4)
@; = iteration count for loop16
beq sub_filter_1bpp_16bytes_done
vmov.i8 d21, #0
vld1.8 {d16,d17}, [r2] @; load 16 pixels
@; d16 = a b c d e f g h
@; d17 = i j k l m n o p
mov r1, #0
sub_filter_1bpp_16bytes:
vshl.i64 d18, d16, #8 @; d18 = 0 a b c d e f g
vadd.i8 d18, d16, d18 @; d18 = a a+b b+c c+d d+e e+f f+g g+h
vshl.i64 d18, d18, #8 @; d18 = 0 a a+b b+c c+d d+e e+f f+g
vadd.i8 d18, d16, d18 @; d18 = a a+b a+b+c b+c+d c+d+e d+e+f e+f+g f+g+h
vshl.i64 d18, d18, #8 @; shift add continuously to propage the sum of previous
vadd.i8 d18, d16, d18 @; and current pixels
vshl.i64 d18, d18, #8
vadd.i8 d18, d16, d18
vshl.i64 d18, d18, #8
vadd.i8 d18, d16, d18
vshl.i64 d18, d18, #8
vadd.i8 d18, d16, d18
vshl.i64 d18, d18, #8
vadd.i8 d18, d16, d18 @; maximum data size for shift is 64 bits i.e. doubleword.
@; after computing thh value of all the pixels in the double word
@; extract the last computed value which will be used by
@; the next set of pixels (i.e next doubleword)
vext.8 d22, d18, d21, #7 @; extract the updated value of d18[7] i.e a+b+c+d+e+f+h
vadd.i8 d17, d17, d22 @; d17 = a+b+c+d+e+f+g+h+i j k l m n o p
vshl.i64 d19, d17, #8 @; continue shift-add as the first half
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #8
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #8
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #8
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #8
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #8
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #8
vadd.i8 d19, d17, d19
vst1.8 {d18,d19},[r2]! @; store the result back
add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
subs r4,r4,#1 @; decrement iteration count
beq sub_filter_1bpp_16bytes_adjust
vext.8 d22, d19, d21, #7 @; more iterations to go
@; extract the last computed value
vld1.8 {d16,d17}, [r2] @; load the next 16 bytes
vadd.i8 d16, d16, d22 @; set up the input by adding the previous pixel
@; value to the input
b sub_filter_1bpp_16bytes
sub_filter_1bpp_16bytes_adjust:
cmp r1, r0 @; no more pixels left .. exit
sub r2, r2, #1 @; more pixels remaining
@; r2 points to the current pixel adjust it
@; so that it points to the prev pixel for the below loop
beq sub_filter_DONE
sub_filter_1bpp_16bytes_done:
vld1.8 {d0[0]},[r2]! @; load 1 byte (1 pixel) into D0[0]
@; increment row pointer
sub_filter_1bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0 @;
vld1.8 {d2[0]},[r2] @; load 1 byte (current pixel) into D2[0]
vadd.i8 d0,d0,d2 @; vector add 1 byte of previous pixel with
@; 1 byte of current pixel
vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel) back
@; into row pointer location and increment
@; row pointer
bne sub_filter_1bpp_loop @; loop back until loop counter == rowbytes
b sub_filter_DONE @; return
#;; -----------------------------
#;; SUB filter, 2 bytes per pixel
#;; -----------------------------
sub_filter_2bpp:
lsrs r4,r0,#4 @; r1 = floor(rowbytes/4)
@; = iteration count for loop16
beq sub_filter_2bpp_16bytes_done
vmov.i8 d21, #0
vld1.8 {d16,d17}, [r2] @; load 16 bytes to q8
@; d16 = a b c d e f g h
@; d17 = i j k l m n o p
mov r1, #0
sub_filter_2bpp_16bytes:
vshl.i64 d18, d16, #16 @; each pixel is 2bytes .. shift by 16 bits to get previous pixel
vadd.i8 d18, d16, d18 @; add to the current pixel
vshl.i64 d18, d18, #16 @; shift-add to propagate the computed sum as the case for 1bpp
vadd.i8 d18, d16, d18
vshl.i64 d18, d18, #16
vadd.i8 d18, d16, d18
vext.8 d22, d18, d21, #6 @; extract the last computed value (i.e. last 2 bytes)
vadd.i8 d17, d17, d22 @; add the last computed pixel to the input
vshl.i64 d19, d17, #16
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #16
vadd.i8 d19, d17, d19
vshl.i64 d19, d19, #16
vadd.i8 d19, d17, d19
vst1.8 {d18,d19},[r2]! @; store the result back
add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
subs r4,r4,#1 @; decrement iteration count
beq sub_filter_2bpp_16bytes_adjust
vext.8 d22, d19, d21, #6 @; extract the last computed value
@; add the last computed pixel to the input
vld1.8 {d16,d17}, [r2]
vadd.i8 d16, d16, d22
b sub_filter_2bpp_16bytes
sub_filter_2bpp_16bytes_adjust:
cmp r1, r0 @; no more pixels left .. exit
sub r2, r2, #2 @; more pixels remaining
@; r2 points to the current pixel adjust it
@; so that it points to the prev pixel for the below loop
beq sub_filter_DONE
sub_filter_2bpp_16bytes_done:
vld1.16 {d0[0]},[r2]! @; load 2 bytes (1 pixel) into D0[0]
@; increment row pointer
sub_filter_2bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0 @;
vld1.16 {d2[0]},[r2] @; load 2 bytes (current pixel) into D2[0]
vadd.i8 d0,d0,d2 @; vector add 2 bytes of previous pixel with
@; 2 bytes of current pixel
vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back
@; into row pointer location and increment
@; row pointer
bne sub_filter_2bpp_loop @; loop back until loop counter == rowbytes
@
b sub_filter_DONE @ ; return
#;; -----------------------------
#;; SUB filter, 3 bytes per pixel
#;; -----------------------------
sub_filter_3bpp:
vld1.32 {d0[0]},[r2], r12 @; load 4 bytes (1 pixel + 1 extra byte) into D0[0]
@; increment row pointer by bpp
sub_filter_3bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0 @;
vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel + 1 extra byte) into D2[0]
vadd.i8 d0,d0,d2 @; vector add 3 bytes of previous pixel with
@; 3 bytes of current pixel
vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back
@; into row pointer location and increment
@; row pointer
vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel) back
@; into row pointer location and increment
@; row pointer
bne sub_filter_3bpp_loop @; loop back until loop counter == rowbytes
b sub_filter_DONE @; return
#;; -----------------------------
#;; SUB filter, 4 bytes per pixel
#;; -----------------------------
sub_filter_4bpp:
vld1.32 {d0[0]},[r2]! @; load 4 bytes (1 pixel) into D0[0]
@; increment row pointer
sub_filter_4bpp_loop: @
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0 @;
vld1.32 {d2[0]},[r2] @; load 4 bytes (current pixel) into D2[0]
vadd.i8 d0,d0,d2 @; vector add 4 bytes of previous pixel with
@; 4 bytes of current pixel
vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back
@; into row pointer location and increment
@; row pointer
bne sub_filter_4bpp_loop @; loop back until loop counter == rowbytes
b sub_filter_DONE @; return
#;; -----------------------------
#;; SUB filter, 6 bytes per pixel
#;; -----------------------------
sub_filter_6bpp:
vld1.8 {d0},[r2],r12 @; load 8 bytes (1 pixel + 2 extra bytes) into D0
@; increment row pointer by bpp
sub_filter_6bpp_loop: @
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0 @;
vld1.8 {d2},[r2] @; load 8 bytes (1 pixel + 2 extra bytes) into D2
vadd.i8 d0,d0,d2 @; vector add 6 bytes of previous pixel with
@; 6 bytes of current pixel
vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel) back
@; into row pointer location and increment
@; row pointer
vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel) back
@; into row pointer location and increment
@; row pointer
bne sub_filter_6bpp_loop @; loop back until loop counter == rowbytes
b sub_filter_DONE @; return
#;; -----------------------------
#;; SUB filter, 8 bytes per pixel
#;; -----------------------------
sub_filter_8bpp:
vld1.8 {d0},[r2]! @; load 8 bytes (1 pixel) into D0
@; increment row pointer
sub_filter_8bpp_loop: @
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0 @;
vld1.8 {d2},[r2] @; load 8 bytes (current pixel) into D2
vadd.i8 d0,d0,d2 @; vector add 8 bytes of previous pixel with
@; 8 bytes of current pixel
vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel) back
@; into row pointer location and increment
@; row pointer
bne sub_filter_8bpp_loop @; loop back until loop counter == rowbytes
@
b sub_filter_DONE @ ; return
sub_filter_DONE:
ldmia sp!, {r4}
bx r14
#;; --------------
#;; UP filter type
#;; --------------
up_filter:
#;; r0 = rowbytes
#;; r1 = pixel_depth (not required for UP filter type)
#;; r2 = row pointer
#;; r3 = previous row pointer
lsrs r1,r0,#5 @; r1 = floor(rowbytes/32)
@; = iteration count for loop32
beq up_filter_32bytes_proc_done
up_filter_32bytes_proc:
mov r12, r2
vld1.8 {q0},[r3]! @; load 32 bytes from previous
vld1.8 {q2},[r3]! @; row and increment pointer
@
@
vld1.8 {q1},[r12]! @; load 32 bytes from current row
vld1.8 {q3},[r12]! @
@
@
@
vadd.i8 q0,q0,q1 @; vector add of 16 bytes
vadd.i8 q2,q2,q3 @
@
@
@
vst1.8 {q0},[r2]! @; store 32 bytes to current row
vst1.8 {q2},[r2]! @
@; and increment pointer
sub r0,r0,#32 @; subtract 32 from rowbytes
subs r1,r1,#1 @; decrement iteration count
bne up_filter_32bytes_proc
up_filter_32bytes_proc_done:
lsrs r1,r0,#4 @; r1 = floor(rowbytes/16)
@; = iteration count for loop16
beq up_filter_16bytes_proc_done
up_filter_16bytes_proc:
vld1.8 {q0},[r3]! @; load 16 bytes from previous
@; row and increment pointer
vld1.8 {q1},[r2] @; load 16 bytes from current row
vadd.i8 q0,q0,q1 @; vector add of 16 bytes
vst1.8 {q0},[r2]! @; store 16 bytes to current row
@; and increment pointer
sub r0,r0,#16 @; subtract 16 from rowbytes
subs r1,r1,#1 @; decrement iteration count
bne up_filter_16bytes_proc
up_filter_16bytes_proc_done:
lsrs r1,r0,#3 @; r1 = floor(rowbytes/8)
beq up_filter_8bytes_proc_done
up_filter_8bytes_proc:
vld1.8 {d0},[r3]! @; load 8 bytes from previous
@; row and increment pointer
vld1.8 {d2},[r2] @; load 8 bytes from current row
vadd.i8 d0,d0,d2 @; vector add 8 bytes
vst1.8 {d0},[r2]! @; store 8 bytes to current row
@; and increment pointer
sub r0,r0,#8 @; subtract 8 from rowbytes
up_filter_8bytes_proc_done:
lsrs r1,r0,#2 @; r1 = floor(rowbytes/4)
beq up_filter_4bytes_proc_done
up_filter_4bytes_proc:
vld1.32 {d0[0]},[r3]! @; load 4 bytes from previous row
@; and increment pointer
vld1.32 {d2[0]},[r2] @; load 4 bytes from current row
vadd.i8 d0,d0,d2 @; vector add 4 bytes
vst1.32 {d0[0]},[r2]! @; store 4 bytes to current row
@; and increment pointer
sub r0,r0,#4 @; subtract 4 from rowbytes
up_filter_4bytes_proc_done:
lsrs r1,r0,#1 @; r1 = floor(rowbytes/2)
beq up_filter_2bytes_proc_done
up_filter_2bytes_proc:
vld1.16 {d0[0]},[r3]! @; load 2 bytes from previous row
@; and increment pointer
vld1.16 {d2[0]},[r2] @; load 2 bytes from current row
vadd.i8 d0,d0,d2 @; vector add 2 bytes
vst1.16 {d0[0]},[r2]! @; store 2 bytes to current row
@; and increment pointer
sub r0,r0,#2 @; subtract 2 from rowbytes
up_filter_2bytes_proc_done:
cmp r0,#0
beq up_filter_1byte_proc_done
up_filter_1byte_proc:
vld1.8 {d0[0]},[r3]! @; load 1 byte from previous row
@; and increment pointer
vld1.8 {d2[0]},[r2] @; load 1 byte from current row
vadd.i8 d0,d0,d2 @; vector add 1 byte
vst1.8 {d0[0]},[r2]! @; store 1 byte to current row
@; and increment pointer
up_filter_1byte_proc_done:
b DONE
#;; ---------------
#;; AVG filter type
#;; ---------------
avg_filter:
add r1,r1,#7 @; bpp = byptes per pixel
lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
mov r12,r1
#;; r0 = rowbytes
#;; r1 = loop counter = bpp (initially)
#;; r2 = row pointer
#;; r3 = previous row pointer
#;; r12 = bpp = loop/pointer increment value
cmp r12,#1
beq avg_filter_1bpp
cmp r12,#2
beq avg_filter_2bpp
cmp r12,#3
beq avg_filter_3bpp
cmp r12,#4
beq avg_filter_4bpp
cmp r12,#6
beq avg_filter_6bpp
cmp r12,#8
beq avg_filter_8bpp
avg_filter_exit:
b DONE @; return
#;; ----------------------------
#;; AVG filter, 1 byte per pixel
#;; ----------------------------
avg_filter_1bpp:
cmp r1,r0
vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
@; row into d0[0]
vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
@; row into d1[0]
@; increment prev row pointer
vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
@; to pixel x
vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
beq DONE
avg_filter_1bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
vld1.8 {d2[0]},[r2] @; load 1 byte (pixel x) from curr
@; row into d2[0]
vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
@; row into d1[0]
vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
bne avg_filter_1bpp_loop
b DONE @; exit loop when
@; loop counter == rowbytes
#;; -----------------------------
#;; AVG filter, 2 bytes per pixel
#;; -----------------------------
avg_filter_2bpp:
cmp r1,r0
vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
@; row into d0[0]
vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
@; row into d1[0]
@; increment prev row pointer
vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
@; to pixel x
vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
beq DONE
avg_filter_2bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
vld1.16 {d2[0]},[r2] @; load 2 bytes (pixel x) from curr
@; row into d2[0]
vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
@; row into d1[0]
vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
bne avg_filter_2bpp_loop
b DONE @; exit loop when
@; loop counter == rowbytes
#;; -----------------------------
#;; AVG filter, 3 bytes per pixel
#;; -----------------------------
avg_filter_3bpp:
cmp r1,r0
vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
@; byte) from curr row into d0[0]
vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
@; byte) from prev row into d1[0]
@; increment prev row pointer
vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
@; to pixel x
vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
beq DONE
avg_filter_3bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x + 1 extra
@; byte) from curr row into d2[0]
vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
@; byte) from prev row into d1[0]
vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
bne avg_filter_3bpp_loop
b DONE @; exit loop when
@; loop counter == rowbytes
#;; -----------------------------
#;; AVG filter, 4 bytes per pixel
#;; -----------------------------
avg_filter_4bpp:
cmp r1,r0
vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
@; row into d0[0]
vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
@; row into d1[0]
@; increment prev row pointer
vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
@; to pixel x
vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
beq DONE
avg_filter_4bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x) from curr
@; row into d2[0]
vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
@; row into d1[0]
vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
vshrn.i16 d1,q2,#1 @; d1[0] = (a + b)/2
vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
bne avg_filter_4bpp_loop
b DONE @; exit loop when
@; loop counter == rowbytes
#;; -----------------------------
#;; AVG filter, 6 bytes per pixel
#;; -----------------------------
avg_filter_6bpp:
cmp r1,r0
vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
@; bytes) from curr row into d0
vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
@; bytes) from prev row into d1
@; increment prev row pointer
vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
@; to pixel x
vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
beq DONE
avg_filter_6bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
vld1.8 {d2},[r2] @; load 8 bytes (pixel x + 2 extra
@; bytes) from curr row into d2
vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
@; bytes) from prev row into d1
vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2
vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
bne avg_filter_6bpp_loop
b DONE @; exit loop when
@; loop counter == rowbytes
#;; -----------------------------
#;; AVG filter, 8 bytes per pixel
#;; -----------------------------
avg_filter_8bpp:
cmp r1,r0
vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
@; row into d0
vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
@; row into d1
@; increment prev row pointer
vsra.u8 d0,d1,#1 @; shift right pixel b by 1 and add
@; to pixel x
vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
@; increment curr row pointer
@; updated pixel x is now pixel a
beq DONE
avg_filter_8bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
vld1.8 {d2},[r2] @; load 8 bytes (pixel x) from curr
@; row into d2
vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
@; row into d1
vaddl.u8 q2,d0,d1 @; q2 = (pixel a + pixel b)
vshrn.i16 d1,q2,#1 @; d1 = (a + b)/2
vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
@; increment curr row pointer
bne avg_filter_8bpp_loop
b DONE @; exit loop when
@; loop counter == rowbytes
#;; -----------------
#;; PAETH filter type
#;; -----------------
paeth_filter:
VPUSH {q4-q7}
add r1,r1,#7 @; bpp = bytes per pixel
lsr r1,r1,#3 @; = (pixel_depth + 7) >> 3
mov r12,r1
#;; r0 = rowbytes
#;; r1 = loop counter = bpp (initially)
#;; r2 = row pointer
#;; r3 = previous row pointer
#;; r12 = bpp = loop/pointer increment value
cmp r12,#1
beq paeth_filter_1bpp
cmp r12,#2
beq paeth_filter_2bpp
cmp r12,#3
beq paeth_filter_3bpp
cmp r12,#4
beq paeth_filter_4bpp
cmp r12,#6
beq paeth_filter_6bpp
cmp r12,#8
beq paeth_filter_8bpp
paeth_filter_exit:
b paeth_filter_DONE @; return
#;; ------------------------------
#;; PAETH filter, 1 byte per pixel
#;; ------------------------------
paeth_filter_1bpp:
cmp r1, r0
vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
@; row into d0[0]
vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
@; row into d1[0]
@; increment prev row pointer
vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
beq paeth_filter_DONE
paeth_filter_1bpp_loop:
add r1,r1,r12 @; increment curr row pointer
cmp r1,r0
#;; d1[0] = c (b in the previous loop iteration)
#;; d2[0] = a (x in the previous loop iteration)
vld1.8 {d3[0]},[r3]! @; load 1 byte (pixel b) from prev
@; row into d3[0]
vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
@; row into d0[0]
vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
vaddl.u8 q5,d2,d3 @; q5 = a + b
vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
@
vand d2,d2,d10 @; d2 = a where 1, 0 where 0
vbsl d14,d3,d1 @; d14 = b where 1, c where 0
vmvn d10,d10 @; invert d10
vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
vmov d1,d3 @; d1 = b (c for next iteration)
vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
bne paeth_filter_1bpp_loop
b paeth_filter_DONE @; exit loop when
@; loop counter == rowbytes
#;; -------------------------------
#;; PAETH filter, 2 bytes per pixel
#;; -------------------------------
paeth_filter_2bpp:
cmp r1, r0
vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
@; row into d0[0]
vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
@; row into d1[0]
@; increment prev row pointer
vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
beq paeth_filter_DONE
paeth_filter_2bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
#;; d1[0] = c (b in the previous loop iteration)
#;; d2[0] = a (x in the previous loop iteration)
vld1.16 {d3[0]},[r3]! @; load 2 bytes (pixel b) from prev
@; row into d3[0]
vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
@; row into d0[0]
vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
vaddl.u8 q5,d2,d3 @; q5 = a + b
vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
vand d2,d2,d10 @; d2 = a where 1, 0 where 0
vbsl d14,d3,d1 @; d14 = b where 1, c where 0
vmvn d10,d10 @; invert d10
vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
vmov d1,d3 @; d1 = b (c for next iteration)
vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
bne paeth_filter_2bpp_loop
b paeth_filter_DONE @; exit loop when
@; loop counter == rowbytes
#;; -------------------------------
#;; PAETH filter, 3 bytes per pixel
#;; -------------------------------
paeth_filter_3bpp:
cmp r1, r0
vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
@; byte) from curr row into d0[0]
vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
@; byte) from prev row into d1[0]
@; increment prev row pointer
vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
beq paeth_filter_DONE
paeth_filter_3bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
#;; d1[0] = c (b in the previous loop iteration)
#;; d2[0] = a (x in the previous loop iteration)
vld1.32 {d3[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
@; byte) from prev row into d3[0]
vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
@; byte) from curr row into d0[0]
vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
vaddl.u8 q5,d2,d3 @; q5 = a + b
vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
@
vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
@
vand d2,d2,d10 @; d2 = a where 1, 0 where 0
vbsl d14,d3,d1 @; d14 = b where 1, c where 0
vmvn d10,d10 @; invert d10
vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
vmov d1,d3 @; d1 = b (c for next iteration)
vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
@; increment curr row pointer
bne paeth_filter_3bpp_loop
b paeth_filter_DONE @; exit loop when
@; loop counter == rowbytes
#;; -------------------------------
#;; PAETH filter, 4 bytes per pixel
#;; -------------------------------
paeth_filter_4bpp:
cmp r1, r0
vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
@; row into d0[0]
vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
@; row into d1[0]
@; increment prev row pointer
vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
beq paeth_filter_DONE
paeth_filter_4bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
#;; d1[0] = c (b in the previous loop iteration)
#;; d2[0] = a (x in the previous loop iteration)
vld1.32 {d3[0]},[r3]! @; load 4 bytes (pixel b) from prev
@; row into d3[0]
vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
@; row into d0[0]
vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
vaddl.u8 q5,d2,d3 @; q5 = a + b
vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
@
vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
@
vand d2,d2,d10 @; d2 = a where 1, 0 where 0
vbsl d14,d3,d1 @; d14 = b where 1, c where 0
vmvn d10,d10 @; invert d10
vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
vmov d1,d3 @; d1 = b (c for next iteration)
vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
bne paeth_filter_4bpp_loop
b paeth_filter_DONE @; exit loop when
@; loop counter == rowbytes
#;; -------------------------------
#;; PAETH filter, 6 bytes per pixel
#;; -------------------------------
paeth_filter_6bpp:
cmp r1, r0
vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
@; bytes) from curr row into d0
vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
@; bytes) from prev row into d1
@; increment prev row pointer
vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
beq paeth_filter_DONE
paeth_filter_6bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
#;; d1[0] = c (b in the previous loop iteration)
#;; d2[0] = a (x in the previous loop iteration)
vld1.8 {d3},[r3],r12 @; load 8 bytes (pixel b + 2 extra
@; bytes) from prev row into d3
vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
@; bytes) from curr row into d0
vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
vaddl.u8 q5,d2,d3 @; q5 = a + b
vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
vand d2,d2,d10 @; d2 = a where 1, 0 where 0
vbsl d14,d3,d1 @; d14 = b where 1, c where 0
vmvn d10,d10 @; invert d10
vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
vmov d1,d3 @; d1 = b (c for next iteration)
vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
@; increment curr row pointer
vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
@; increment curr row pointer
bne paeth_filter_6bpp_loop
b paeth_filter_DONE @; exit loop when
@; loop counter == rowbytes
#;; -------------------------------
#;; PAETH filter, 8 bytes per pixel
#;; -------------------------------
paeth_filter_8bpp:
cmp r1, r0
vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
@; row into d0
vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
@; row into d1
@; increment prev row pointer
vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
@; increment curr row pointer
beq paeth_filter_DONE
paeth_filter_8bpp_loop:
add r1,r1,r12 @; loop counter += bpp
cmp r1,r0
#;; d1[0] = c (b in the previous loop iteration)
#;; d2[0] = a (x in the previous loop iteration)
vld1.8 {d3},[r3]! @; load 8 bytes (pixel b) from prev
@; row into d3
vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
@; row into d0
vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
vaddl.u8 q5,d2,d3 @; q5 = a + b
vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
@
vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
@
vand d2,d2,d10 @; d2 = a where 1, 0 where 0
vbsl d14,d3,d1 @; d14 = b where 1, c where 0
vmvn d10,d10 @; invert d10
vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
vmov d1,d3 @; d1 = b (c for next iteration)
vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
@; increment curr row pointer
bne paeth_filter_8bpp_loop
b paeth_filter_DONE @; exit loop when
@; loop counter == rowbytes
paeth_filter_DONE:
VPOP {q4-q7}
bx r14
DONE:
bx r14
.size png_read_filter_row_neon, .-png_read_filter_row_neon
.END