Files
third_party_libpng/libpng_optimize.patch
T
zhwang0 9ac0f66add update patch for libpng
Signed-off-by: zhwang0 <zhwang0@163.com>
2025-09-05 15:14:07 +08:00

3408 lines
110 KiB
Diff

diff --git a/arm/arm_init.c b/arm/arm_init.c
index 3a89998ab..05aa2c0d9 100644
--- a/arm/arm_init.c
+++ b/arm/arm_init.c
@@ -113,13 +113,23 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
* initialization function.)
*/
pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon;
-
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon;
+#endif
if (bpp == 3)
{
pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon;
pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon;
pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
png_read_filter_row_paeth3_neon;
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
+ png_read_filter_row_avg3_x2_neon;
+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
+ png_read_filter_row_paeth3_x2_neon;
+#endif
}
else if (bpp == 4)
@@ -128,6 +138,13 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon;
pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
png_read_filter_row_paeth4_neon;
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
+ png_read_filter_row_avg4_x2_neon;
+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
+ png_read_filter_row_paeth4_x2_neon;
+#endif
}
}
#endif /* PNG_ARM_NEON_OPT > 0 */
diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
index 4466d48b2..27048a578 100644
--- a/arm/filter_neon_intrinsics.c
+++ b/arm/filter_neon_intrinsics.c
@@ -47,6 +47,7 @@
#if PNG_ARM_NEON_OPT > 0
+#ifndef PNG_MULTY_LINE_ENABLE
void
png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
png_const_bytep prev_row)
@@ -396,7 +397,1351 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
}
}
+#else
+// OH ISSUE: png optimize
+// By definition row_info->rowbytes = row_width * row_info->channels, so for the
+// 3-byte and 4-byte pixel filters below rowbytes is a multiple of 3 or 4. Therefore:
+// RGB: NEON processes 12 bytes (4 pixels) per step, so the tail is 0, 3, 6 or 9 bytes;
+// RGBA: NEON processes 16 and then 8 bytes per step, so the tail is 0 or 4 bytes.
+// The filter operators are internal functions; callers ensure row_info and row are non-empty.
+#define STEP_RGB (12) // 3-channel RGB: NEON processes 12 bytes (4 pixels) at once
+#define TAIL_RGB3 (9) // a tail of 3 RGB pixels is 9 bytes
+#define TAIL_RGB2 (6) // a tail of 2 RGB pixels is 6 bytes
+#define TAIL_RGB1 (3) // a tail of 1 RGB pixel is 3 bytes
+#define STEP_RGBA (16) // RGBA: NEON processes 16 bytes (4 pixels) at once
+#define STEP_RGBA_HALF (8) // RGBA: NEON processes 8 bytes (2 pixels) at once
+#define TAIL_RGBA (4) // a tail of 1 RGBA pixel is 4 bytes
+#define IND3 (3) // vector-array index 3
+#define IND2 (2) // vector-array index 2
+#define OFFSET3 (3) // RGB offset of 3 bytes (1 pixel)
+#define OFFSET6 (6) // RGB offset of 6 bytes (2 pixels)
+void png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Up filter in place: row[i] = (row[i] + prev_row[i]) & 0xff for rowbytes bytes. */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ size_t count = row_info->rowbytes; /* size_t, not int: a large rowbytes must not be narrowed */
+
+ png_debug(1, "in png_read_filter_row_up_neon");
+
+ uint8x16_t qrp, qpp;
+ while (count >= STEP_RGBA) { /* 16 bytes per iteration */
+ qrp = vld1q_u8(rp);
+ qpp = vld1q_u8(pp);
+ qrp = vaddq_u8(qrp, qpp);
+ vst1q_u8(rp, qrp);
+ rp += STEP_RGBA;
+ pp += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* At most one 8-byte step for what is left. */
+ if (count >= STEP_RGBA_HALF) {
+ uint8x8_t qrp1, qpp1;
+ qrp1 = vld1_u8(rp);
+ qpp1 = vld1_u8(pp);
+ qrp1 = vadd_u8(qrp1, qpp1);
+ vst1_u8(rp, qrp1);
+ rp += STEP_RGBA_HALF;
+ pp += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+ /* Scalar loop for the remaining bytes (fewer than 8). */
+ for (size_t i = 0; i < count; i++) {
+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+ rp++;
+ }
+}
+
+void png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Up filter in place for two rows: row (using prev_row) and the row stored after it (using row). */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ size_t count = row_info->rowbytes; /* size_t, not int: a large rowbytes must not be narrowed */
+ png_bytep np = row + row_info->rowbytes + 1; /* second row; NOTE(review): the +1 presumably skips its filter-type byte - confirm with the caller */
+
+ png_debug(1, "in png_read_filter_row_up_x2_neon");
+
+ uint8x16_t qrp, qpp, qnp;
+ while (count >= STEP_RGBA) { /* 16 bytes of each row per iteration */
+ qrp = vld1q_u8(rp);
+ qpp = vld1q_u8(pp);
+ qnp = vld1q_u8(np);
+ qrp = vaddq_u8(qrp, qpp);
+ qnp = vaddq_u8(qnp, qrp); /* the second row adds the just-reconstructed first row */
+ vst1q_u8(rp, qrp);
+ vst1q_u8(np, qnp);
+ rp += STEP_RGBA;
+ pp += STEP_RGBA;
+ np += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* At most one 8-byte step for what is left. */
+ if (count >= STEP_RGBA_HALF) {
+ uint8x8_t qrp1, qpp1, qnp1;
+ qrp1 = vld1_u8(rp);
+ qpp1 = vld1_u8(pp);
+ qnp1 = vld1_u8(np);
+ qrp1 = vadd_u8(qrp1, qpp1);
+ qnp1 = vadd_u8(qnp1, qrp1);
+ vst1_u8(rp, qrp1);
+ vst1_u8(np, qnp1);
+ rp += STEP_RGBA_HALF;
+ pp += STEP_RGBA_HALF;
+ np += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+ /* Scalar loop for the remaining bytes (fewer than 8) of both rows. */
+ for (size_t i = 0; i < count; i++) {
+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
+ *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff);
+ np++;
+ }
+}
+
+void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Sub filter in place for 3-byte pixels: each pixel adds the pixel to its left; prev_row is unused. */
+ png_bytep rp = row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+ /* Preload the first 16 bytes of the row, viewed as two 8-byte vectors. */
+ uint8x16_t vtmp = vld1q_u8(rp);
+ uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ uint8x8x2_t vrp = *vrpt;
+ /* vdest.val[IND3] holds the previously reconstructed pixel; it is zero before the first pixel. */
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0);
+
+ uint8x8_t vtmp1, vtmp2;
+ uint32x2_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+
+ png_debug(1, "in png_read_filter_row_sub3_neon");
+
+ size_t tail_bytes = row_info->rowbytes % STEP_RGB;
+ png_byte last_byte = *rp_stop; /* saved because the 4-byte store of the last pixel also writes *rp_stop */
+ png_bytep rp_stop_new = rp_stop - tail_bytes;
+ for (; rp < rp_stop_new;)
+ {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
+ /* Pixel 3 starts at byte 9 of the chunk, i.e. byte 1 of the upper half. */
+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+ vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND2], vtmp1);
+ /* Load the next chunk before storing: each 4-byte store spills one byte past its 3-byte pixel. */
+ vtmp = vld1q_u8(rp + STEP_RGB);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
+ rp += OFFSET3;
+ }
+ /* Tail: 1, 2 or 3 leftover pixels, taken from the chunk already held in vrp. */
+ if (tail_bytes == TAIL_RGB1) {
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ } else if (tail_bytes == TAIL_RGB2) {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ } else if (tail_bytes == TAIL_RGB3) {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
+ vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ }
+ *rp_stop = last_byte; /* put back the byte after the row that the last store may have overwritten */
+
+ PNG_UNUSED(prev_row)
+}
+
+void png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Sub filter in place for 4-byte pixels: each pixel adds the pixel to its left; prev_row is unused. */
+ png_bytep rp = row;
+ size_t count = row_info->rowbytes; /* size_t, not int: a large rowbytes must not be narrowed */
+ /* vdest.val[IND3] holds the previously reconstructed pixel; it is zero before the first pixel. */
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0);
+
+ png_debug(1, "in png_read_filter_row_sub4_neon");
+
+ uint32x2x4_t vtmp;
+ uint8x8x4_t *vrpt;
+ uint8x8x4_t vrp;
+ uint32x2x4_t vdest_val;
+ while (count >= STEP_RGBA) { /* 4 pixels (16 bytes) per iteration */
+ uint32x2x4_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+ vtmp = vld4_u32(png_ptr(uint32_t, rp));
+ vrpt = png_ptr(uint8x8x4_t, &vtmp);
+ vrp = *vrpt;
+ /* Lane 0 of val[k] is pixel k of this chunk; each pixel adds its left neighbour. */
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
+ vdest.val[IND2] = vadd_u8(vdest.val[1], vrp.val[IND2]);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND2], vrp.val[IND3]);
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdest);
+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
+
+ rp += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* At most one 2-pixel (8-byte) step for what is left. */
+ if (count >= STEP_RGBA_HALF) {
+ uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
+ uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ uint8x8x2_t vrp1 = *vrpt1;
+ uint32x2x2_t *temp_pointer;
+ uint32x2x2_t vdest_val1;
+
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp1.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]);
+ vdest.val[IND3] = vdest.val[1]; /* keep the last pixel as the left neighbour for the tail */
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
+
+ rp += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+
+ if (count == 0) {
+ return;
+ }
+ /* Final pixel: vld1_u32 loads 8 bytes but only lane 0 (4 bytes) is written back. */
+ uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
+ uint8x8_t *vrpt2 = png_ptr(uint8x8_t, &vtmp2);
+ uint8x8_t vrp2 = *vrpt2;
+ uint32x2_t *temp_pointer;
+ uint32x2_t vdest_val2;
+
+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp2);
+ vdest_val2 = png_ldr(uint32x2_t, &vdest);
+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
+
+ PNG_UNUSED(prev_row)
+}
+
+void png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Average filter in place for 3-byte pixels: pixel += (left + above) >> 1, with above taken from prev_row. */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+
+ uint8x16_t vtmp;
+ uint8x8x2_t *vrpt;
+ uint8x8x2_t vrp;
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel is zero */
+ /* Preload the first 16 bytes of the row, viewed as two 8-byte vectors. */
+ vtmp = vld1q_u8(rp);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ png_debug(1, "in png_read_filter_row_avg3_neon");
+
+ uint8x8_t vtmp1, vtmp2, vtmp3;
+ uint8x8x2_t *vppt;
+ uint8x8x2_t vpp;
+ uint32x2_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+
+ size_t tail_bytes = row_info->rowbytes % STEP_RGB;
+ png_byte last_byte = *rp_stop; /* saved because the 4-byte store of the last pixel also writes *rp_stop */
+ png_bytep rp_stop_new = rp_stop - tail_bytes;
+ for (; rp < rp_stop_new; pp += STEP_RGB)
+ {
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+ /* Pixel 0: halving add of left and above, plus the filtered byte. */
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+ /* Load the next chunk before storing: each 4-byte store spills one byte past its 3-byte pixel. */
+ vtmp = vld1q_u8(rp + STEP_RGB);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
+
+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
+
+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
+ rp += OFFSET3;
+ }
+ /* Tail: load 16 bytes of prev_row for the leftover pixels (vrp already holds the row bytes). */
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+
+ if (tail_bytes == TAIL_RGB1) {
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ } else if (tail_bytes == TAIL_RGB2) {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ } else if (tail_bytes == TAIL_RGB3) {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+
+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ }
+ *rp_stop = last_byte; /* put back the byte after the row that the last store may have overwritten */
+}
+
+void png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Average filter in place for two rows of 3-byte pixels: row (above = prev_row) and the row stored after it (above = row). */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+ png_bytep np = rp_stop + 1; /* second row; NOTE(review): the +1 presumably skips its filter-type byte - confirm with the caller */
+
+ uint8x16_t vtmp;
+ uint8x8x2_t *vrpt;
+ uint8x8x2_t vrp;
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of row is zero */
+ /* Preload the first 16 bytes of row. */
+ vtmp = vld1q_u8(rp);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ uint8x8x2_t *vnpt;
+ uint8x8x2_t vnp;
+ uint8x8x4_t vdestN;
+ vdestN.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of the second row is zero */
+ /* Preload the first 16 bytes of the second row. */
+ vtmp = vld1q_u8(np);
+ vnpt = png_ptr(uint8x8x2_t, &vtmp);
+ vnp = *vnpt;
+
+ png_debug(1, "in png_read_filter_row_avg3_x2_neon");
+
+ uint8x8_t vtmp1, vtmp2, vtmp3;
+ uint8x8x2_t *vppt;
+ uint8x8x2_t vpp;
+ uint32x2_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+
+ size_t tail_bytes = row_info->rowbytes % STEP_RGB;
+ png_byte last_byte = *rp_stop; /* saved because the 4-byte store of the last pixel of row also writes *rp_stop */
+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); /* same for the byte after the second row */
+ png_bytep rp_stop_new = rp_stop - tail_bytes;
+ for (; rp < rp_stop_new; pp += STEP_RGB)
+ {
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+ /* First row: pixel += (left + above) >> 1, above taken from prev_row. */
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+ /* Load the next chunk of row before storing: each 4-byte store spills one byte past its 3-byte pixel. */
+ vtmp = vld1q_u8(rp + STEP_RGB);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
+
+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
+
+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
+ rp += OFFSET3;
+ /* Second row: same recurrence, with the pixels just reconstructed in vdest as "above". */
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+
+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
+ /* Load the next chunk of the second row before its overlapping stores. */
+ vtmp = vld1q_u8(np + STEP_RGB);
+ vnpt = png_ptr(uint8x8x2_t, &vtmp);
+ vnp = *vnpt;
+
+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3);
+
+ vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]);
+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0);
+ np += OFFSET3;
+ }
+ /* Tail: load 16 bytes of prev_row for the leftover pixels (vrp and vnp already hold the row bytes). */
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+
+ if (tail_bytes == TAIL_RGB1) {
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ } else if (tail_bytes == TAIL_RGB2) {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+
+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
+ } else if (tail_bytes == TAIL_RGB3) {
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+
+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+
+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
+
+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3);
+
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
+ }
+ *rp_stop = last_byte; /* put back the byte after row that the last store may have overwritten */
+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; /* and the byte after the second row */
+}
+
+void png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Average filter in place for 4-byte pixels: pixel += (left + above) >> 1, with above taken from prev_row. */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ size_t count = row_info->rowbytes; /* size_t, not int: a large rowbytes must not be narrowed */
+
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel is zero */
+
+ png_debug(1, "in png_read_filter_row_avg4_neon");
+
+ uint32x2x4_t vtmp;
+ uint8x8x4_t *vrpt, *vppt;
+ uint8x8x4_t vrp, vpp;
+ uint32x2x4_t vdest_val;
+ while (count >= STEP_RGBA) { /* 4 pixels (16 bytes) per iteration */
+ uint32x2x4_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+ vtmp = vld4_u32(png_ptr(uint32_t, rp));
+ vrpt = png_ptr(uint8x8x4_t, &vtmp);
+ vrp = *vrpt;
+ vtmp = vld4_u32(png_ptrc(uint32_t, pp));
+ vppt = png_ptr(uint8x8x4_t, &vtmp);
+ vpp = *vppt;
+ /* Lane 0 of val[k] is pixel k of this chunk; vhadd_u8 is the halving add of left and above. */
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdest);
+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
+
+ rp += STEP_RGBA;
+ pp += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* At most one 2-pixel (8-byte) step for what is left. */
+ if (count >= STEP_RGBA_HALF) {
+ uint32x2x2_t vtmp1;
+ uint8x8x2_t *vrpt1, *vppt1;
+ uint8x8x2_t vrp1, vpp1;
+ uint32x2x2_t *temp_pointer;
+ uint32x2x2_t vdest_val1;
+
+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vrp1 = *vrpt1;
+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vpp1 = *vppt1;
+
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
+ vdest.val[IND3] = vdest.val[1]; /* keep the last pixel as the left neighbour for the tail */
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
+
+ rp += STEP_RGBA_HALF;
+ pp += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+
+ if (count == 0) {
+ return;
+ }
+ /* Final pixel: each vld1_u32 loads 8 bytes but only lane 0 (4 bytes) is written back. */
+ uint32x2_t vtmp2;
+ uint8x8_t *vrpt2, *vppt2;
+ uint8x8_t vrp2, vpp2;
+ uint32x2_t *temp_pointer;
+ uint32x2_t vdest_val2;
+
+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
+ vrpt2 = png_ptr(uint8x8_t, &vtmp2);
+ vrp2 = *vrpt2;
+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
+ vppt2 = png_ptr(uint8x8_t, &vtmp2);
+ vpp2 = *vppt2;
+
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
+
+ vdest_val2 = png_ldr(uint32x2_t, &vdest);
+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
+}
+void png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Average filter in place for two rows of 4-byte pixels: row (above = prev_row) and the row stored after it (above = row). */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ size_t count = row_info->rowbytes; /* size_t, not int: a large rowbytes must not be narrowed */
+ png_bytep np = row + count + 1; /* second row; NOTE(review): the +1 presumably skips its filter-type byte - confirm with the caller */
+
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of row is zero */
+
+ png_debug(1, "in png_read_filter_row_avg4_x2_neon");
+
+ uint32x2x4_t vtmp;
+ uint8x8x4_t *vrpt, *vppt;
+ uint8x8x4_t vrp, vpp;
+ uint32x2x4_t vdest_val;
+
+ uint8x8x4_t *vnpt;
+ uint8x8x4_t vnp;
+ uint8x8x4_t vdestN;
+ vdestN.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of the second row is zero */
+
+ while (count >= STEP_RGBA) { /* 4 pixels (16 bytes) of each row per iteration */
+ uint32x2x4_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+ vtmp = vld4_u32(png_ptr(uint32_t, rp));
+ vrpt = png_ptr(uint8x8x4_t, &vtmp);
+ vrp = *vrpt;
+ vtmp = vld4_u32(png_ptrc(uint32_t, pp));
+ vppt = png_ptr(uint8x8x4_t, &vtmp);
+ vpp = *vppt;
+ vtmp = vld4_u32(png_ptrc(uint32_t, np));
+ vnpt = png_ptr(uint8x8x4_t, &vtmp);
+ vnp = *vnpt;
+ /* First row: pixel += (left + above) >> 1, above taken from prev_row. */
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdest);
+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
+ /* Second row: same recurrence, with the pixels just reconstructed in vdest as "above". */
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]);
+ vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]);
+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]);
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdestN);
+ vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0);
+
+ rp += STEP_RGBA;
+ pp += STEP_RGBA;
+ np += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* At most one 2-pixel (8-byte) step for what is left. */
+ if (count >= STEP_RGBA_HALF) {
+ uint32x2x2_t vtmp1;
+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
+ uint8x8x2_t vrp1, vpp1, vnp1;
+ uint32x2x2_t *temp_pointer;
+ uint32x2x2_t vdest_val1;
+
+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vrp1 = *vrpt1;
+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vpp1 = *vppt1;
+ vtmp1 = vld2_u32(png_ptrc(uint32_t, np));
+ vnpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vnp1 = *vnpt1;
+
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
+ vdest.val[IND3] = vdest.val[1]; /* keep the last pixel as the left neighbour for the tail */
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
+
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
+ vdestN.val[IND3] = vdestN.val[1]; /* likewise for the second row */
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
+ vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0);
+
+ rp += STEP_RGBA_HALF;
+ pp += STEP_RGBA_HALF;
+ np += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+
+ if (count == 0) {
+ return;
+ }
+ /* Final pixel of each row: each vld1_u32 loads 8 bytes but only lane 0 (4 bytes) is written back. */
+ uint32x2_t vtmp2;
+ uint8x8_t *vrpt2, *vppt2, *vnpt2;
+ uint8x8_t vrp2, vpp2, vnp2;
+ uint32x2_t *temp_pointer;
+ uint32x2_t vdest_val2;
+
+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
+ vrpt2 = png_ptr(uint8x8_t, &vtmp2);
+ vrp2 = *vrpt2;
+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
+ vppt2 = png_ptr(uint8x8_t, &vtmp2);
+ vpp2 = *vppt2;
+ vtmp2 = vld1_u32(png_ptrc(uint32_t, np));
+ vnpt2 = png_ptr(uint8x8_t, &vtmp2);
+ vnp2 = *vnpt2;
+
+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
+
+ vdest_val2 = png_ldr(uint32x2_t, &vdest);
+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
+
+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
+
+ vdest_val2 = png_ldr(uint32x2_t, &vdestN);
+ vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0);
+}
+
+static uint8x8_t paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+ /* Lane-wise Paeth predictor. In the callers visible in this file a is the
+ * pixel to the left, b the pixel above and c the pixel above-left.
+ * Per lane the result is a when pa <= pb and pa <= pc, otherwise b when
+ * pb <= pc, otherwise c. Distances use 16-bit lanes so that a + b and
+ * 2 * c cannot wrap. */
+ const uint16x8_t sum_ab = vaddl_u8(a, b); /* a + b */
+ const uint16x8_t twice_c = vaddl_u8(c, c); /* c * 2 */
+ const uint16x8_t dist_a = vabdl_u8(b, c); /* pa = |b - c| */
+ const uint16x8_t dist_b = vabdl_u8(a, c); /* pb = |a - c| */
+ const uint16x8_t dist_c = vabdq_u16(sum_ab, twice_c); /* pc = |a + b - 2c| */
+
+ /* All-ones 16-bit lanes where the comparison holds. */
+ const uint16x8_t a_le_b = vcleq_u16(dist_a, dist_b);
+ const uint16x8_t a_le_c = vcleq_u16(dist_a, dist_c);
+ const uint16x8_t b_le_c = vcleq_u16(dist_b, dist_c);
+ const uint16x8_t take_a = vandq_u16(a_le_b, a_le_c);
+
+ /* Narrow each mask to bytes and select: first b versus c, then a versus that. */
+ const uint8x8_t take_b_mask = vmovn_u16(b_le_c);
+ const uint8x8_t take_a_mask = vmovn_u16(take_a);
+ const uint8x8_t b_or_c = vbsl_u8(take_b_mask, b, c);
+ return vbsl_u8(take_a_mask, a, b_or_c);
+}
+
+void png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{ /* Undo the Paeth filter in place for 3-byte pixels: pixel += paeth(left, above, above-left), with above taken from prev_row. */
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+
+ uint8x16_t vtmp;
+ uint8x8x2_t *vrpt;
+ uint8x8x2_t vrp;
+ uint8x8_t vlast = vdup_n_u8(0); /* above-left pixel; zero before the first pixel */
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0); /* left pixel; zero before the first pixel */
+ /* Preload the first 16 bytes of the row, viewed as two 8-byte vectors. */
+ vtmp = vld1q_u8(rp);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ uint8x8x2_t *vppt;
+ uint8x8x2_t vpp;
+ uint8x8_t vtmp1, vtmp2, vtmp3;
+ uint32x2_t *temp_pointer; /* NOTE(review): presumably consumed by the png_ldr macro (defined outside this view) */
+
+ png_debug(1, "in png_read_filter_row_paeth3_neon");
+
+ size_t tail_bytes = row_info->rowbytes % STEP_RGB;
+ png_byte last_byte = *rp_stop; /* saved because the 4-byte store of the last pixel also writes *rp_stop */
+ png_bytep rp_stop_new = rp_stop - tail_bytes;
+ for (; rp < rp_stop_new; pp += STEP_RGB)
+ {
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+ /* Pixel 0: left = previous output, above = prev_row pixel 0, above-left = vlast. */
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
+ /* Pixel 3 starts at byte 9 of the chunk, i.e. byte 1 of the upper half. */
+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
+ /* Load the next chunk before storing: each 4-byte store spills one byte past its 3-byte pixel. */
+ vtmp = vld1q_u8(rp + STEP_RGB);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
+
+ vlast = vtmp2; /* prev_row pixel 3 becomes the above-left pixel of the next chunk */
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
+ rp += OFFSET3;
+ }
+ /* Tail: load 16 bytes of prev_row for the leftover pixels (vrp already holds the row bytes). */
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+
+ if (tail_bytes == TAIL_RGB1) {
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ } else if (tail_bytes == TAIL_RGB2) {
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ } else if (tail_bytes == TAIL_RGB3) {
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ }
+ *rp_stop = last_byte; /* put back the byte after the row that the last store may have overwritten */
+}
+/* Paeth-unfilter two consecutive rows of 3-byte pixels in one pass: `row` and the row stored rowbytes + 1 bytes after it. */
+void png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ png_bytep rp = row;
+ png_const_bytep pp = prev_row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+ png_bytep np = rp_stop + 1; /* second row; skips one byte, presumably that row's filter byte */
+ /* rp, pp and np walk the first row, the row above it and the second row. */
+ uint8x16_t vtmp;
+ uint8x8x2_t *vrpt;
+ uint8x8x2_t vrp;
+ uint8x8_t vlast = vdup_n_u8(0);
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0);
+ /* Preload 16 raw bytes of the first row. */
+ vtmp = vld1q_u8(rp);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+ /* NOTE(review): vld1q_u8 reads 16 bytes at a time, so loads near the end of a row read past it - confirm the buffers are padded. */
+ uint8x8x2_t *vppt;
+ uint8x8x2_t vpp;
+ uint8x8_t vtmp1, vtmp2, vtmp3;
+ uint32x2_t *temp_pointer;
+
+ uint8x8x2_t *vnpt;
+ uint8x8x2_t vnp;
+ uint8x8_t vlastN = vdup_n_u8(0); /* the "N" locals mirror the first-row state for the second row */
+ uint8x8x4_t vdestN;
+ vdestN.val[IND3] = vdup_n_u8(0);
+ /* Preload 16 raw bytes of the second row. */
+ vtmp = vld1q_u8(np);
+ vnpt = png_ptr(uint8x8x2_t, &vtmp);
+ vnp = *vnpt;
+
+ png_debug(1, "in png_read_filter_row_paeth3_x2_neon");
+
+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; /* bytes left over after the full-step loop */
+ png_byte last_byte = *rp_stop; /* byte just past the first row; restored before returning */
+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); /* byte just past the second row; restored too */
+ png_bytep rp_stop_new = rp_stop - tail_bytes;
+ /* Each pass unfilters four pixels of the first row, then four of the second. */
+ for (; rp < rp_stop_new; pp += STEP_RGB)
+ {
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+ /* First row: paeth() gets the previous output pixel, the prev_row pixel here, and the prev_row pixel before it. */
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
+
+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
+ /* Load the next step's raw bytes before storing: presumably each 4-byte store below spills one byte past its 3-byte pixel - confirm. */
+ vtmp = vld1q_u8(rp + STEP_RGB);
+ vrpt = png_ptr(uint8x8x2_t, &vtmp);
+ vrp = *vrpt;
+
+ vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
+
+ vlast = vtmp2;
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
+ rp += OFFSET3;
+ /* Second row: same recurrence, with the first row's fresh output (vdest) taking the place of the row above. */
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1);
+
+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
+
+ vtmp = vld1q_u8(np + STEP_RGB);
+ vnpt = png_ptr(uint8x8x2_t, &vtmp);
+ vnp = *vnpt;
+
+ vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]);
+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1);
+
+ vlastN = vdest.val[IND3];
+
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0);
+ np += OFFSET3;
+ }
+ /* Tail: the branches below finish one, two or three leftover pixels of each row, selected by tail_bytes. */
+ vtmp = vld1q_u8(pp);
+ vppt = png_ptr(uint8x8x2_t, &vtmp);
+ vpp = *vppt;
+
+ if (tail_bytes == TAIL_RGB1) {
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ } else if (tail_bytes == TAIL_RGB2) {
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
+ } else if (tail_bytes == TAIL_RGB3) {
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
+ rp += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
+
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
+
+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1);
+
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
+ np += OFFSET3;
+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
+ }
+ *rp_stop = last_byte; /* put back the byte after the first row that was saved on entry */
+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; /* likewise for the byte after the second row */
+}
+/* Paeth-unfilter one row of 4-byte pixels in place: four pixels per main-loop pass, then two, then one for the leftovers. */
+void png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ png_bytep rp = row;
+ int count = row_info->rowbytes;
+ png_const_bytep pp = prev_row;
+ /* vlast holds the prev_row pixel before the current position, vdest.val[IND3] the last pixel written; both start at zero. */
+ uint8x8_t vlast = vdup_n_u8(0);
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0);
+
+ png_debug(1, "in png_read_filter_row_paeth4_neon");
+
+ uint32x2x4_t vtmp;
+ uint8x8x4_t *vrpt, *vppt;
+ uint8x8x4_t vrp, vpp;
+ uint32x2x4_t vdest_val;
+ while (count >= STEP_RGBA) { /* four pixels per pass; only lane 0 of each vector is stored */
+ uint32x2x4_t *temp_pointer;
+ vtmp = vld4_u32(png_ptr(uint32_t, rp));
+ vrpt = png_ptr(uint8x8x4_t, &vtmp);
+ vrp = *vrpt;
+ vtmp = vld4_u32(png_ptrc(uint32_t, pp));
+ vppt = png_ptr(uint8x8x4_t, &vtmp);
+ vpp = *vppt;
+
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
+ vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
+ vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
+
+ vlast = vpp.val[IND3];
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdest);
+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
+
+ rp += STEP_RGBA;
+ pp += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* Two leftover pixels, when at least STEP_RGBA_HALF bytes remain. */
+ if (count >= STEP_RGBA_HALF) {
+ uint32x2x2_t vtmp1;
+ uint8x8x2_t *vrpt1, *vppt1;
+ uint8x8x2_t vrp1, vpp1;
+ uint32x2x2_t *temp_pointer;
+ uint32x2x2_t vdest_val1;
+
+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vrp1 = *vrpt1;
+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vpp1 = *vppt1;
+
+ vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
+ vlast = vpp1.val[1];
+
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
+ vdest.val[IND3] = vdest.val[1]; /* becomes the "previous output" input of the final pixel below */
+
+ rp += STEP_RGBA_HALF;
+ pp += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+ /* Whatever is still left is handled as one final pixel. */
+ if (count == 0) {
+ return;
+ }
+
+ uint32x2_t vtmp2;
+ uint8x8_t *vrpt2, *vppt2;
+ uint8x8_t vrp2, vpp2;
+ uint32x2_t *temp_pointer;
+ uint32x2_t vdest_val2;
+
+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
+ vrpt2 = png_ptr(uint8x8_t, &vtmp2);
+ vrp2 = *vrpt2;
+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
+ vppt2 = png_ptr(uint8x8_t, &vtmp2);
+ vpp2 = *vppt2;
+
+ vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
+
+ vdest_val2 = png_ldr(uint32x2_t, &vdest);
+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
+}
+/* Paeth-unfilter two consecutive rows of 4-byte pixels in one pass: `row` and the row stored rowbytes + 1 bytes after it. */
+void png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row,
+ png_const_bytep prev_row)
+{
+ png_bytep rp = row;
+ int count = row_info->rowbytes;
+ png_const_bytep pp = prev_row;
+ png_bytep np = row + row_info->rowbytes + 1; /* second row; skips one byte, presumably that row's filter byte */
+
+ uint8x8_t vlast = vdup_n_u8(0);
+ uint8x8x4_t vdest;
+ vdest.val[IND3] = vdup_n_u8(0);
+
+ png_debug(1, "in png_read_filter_row_paeth4_x2_neon");
+
+ uint32x2x4_t vtmp;
+ uint8x8x4_t *vrpt, *vppt;
+ uint8x8x4_t vrp, vpp;
+ uint32x2x4_t vdest_val;
+ /* Second-row ("N") state, mirroring vrp / vlast / vdest of the first row. */
+ uint8x8x4_t *vnpt;
+ uint8x8x4_t vnp;
+ uint8x8_t vlastN = vdup_n_u8(0);
+ uint8x8x4_t vdestN;
+ vdestN.val[IND3] = vdup_n_u8(0);
+
+ while (count >= STEP_RGBA) { /* four pixels of each row per pass */
+ uint32x2x4_t *temp_pointer;
+ vtmp = vld4_u32(png_ptr(uint32_t, rp));
+ vrpt = png_ptr(uint8x8x4_t, &vtmp);
+ vrp = *vrpt;
+ vtmp = vld4_u32(png_ptrc(uint32_t, pp));
+ vppt = png_ptr(uint8x8x4_t, &vtmp);
+ vpp = *vppt;
+ vtmp = vld4_u32(png_ptrc(uint32_t, np));
+ vnpt = png_ptr(uint8x8x4_t, &vtmp);
+ vnp = *vnpt;
+ /* First row: paeth() gets the previous output pixel, the prev_row pixel here, and the prev_row pixel before it. */
+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
+ vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]);
+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
+ vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]);
+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
+
+ vlast = vpp.val[IND3];
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdest);
+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
+ /* Second row: same recurrence, with the first row's fresh output (vdest) taking the place of the row above. */
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]);
+ vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]);
+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]);
+
+ vlastN = vdest.val[IND3];
+
+ vdest_val = png_ldr(uint32x2x4_t, &vdestN);
+ vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0);
+
+ rp += STEP_RGBA;
+ pp += STEP_RGBA;
+ np += STEP_RGBA;
+ count -= STEP_RGBA;
+ }
+ /* Two leftover pixels per row, when at least STEP_RGBA_HALF bytes remain. */
+ if (count >= STEP_RGBA_HALF) {
+ uint32x2x2_t vtmp1;
+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
+ uint8x8x2_t vrp1, vpp1, vnp1;
+ uint32x2x2_t *temp_pointer;
+ uint32x2x2_t vdest_val1;
+
+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vrp1 = *vrpt1;
+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vpp1 = *vppt1;
+ vtmp1 = vld2_u32(png_ptrc(uint32_t, np));
+ vnpt1 = png_ptr(uint8x8x2_t, &vtmp1);
+ vnp1 = *vnpt1;
+
+ vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
+
+ vlast = vpp1.val[1];
+
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
+
+ vdest.val[IND3] = vdest.val[1]; /* becomes the "previous output" input of the final pixel below */
+
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
+
+ vlastN = vdest.val[1];
+
+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
+ vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0);
+
+ vdestN.val[IND3] = vdestN.val[1];
+
+ rp += STEP_RGBA_HALF;
+ pp += STEP_RGBA_HALF;
+ np += STEP_RGBA_HALF;
+ count -= STEP_RGBA_HALF;
+ }
+ /* Whatever is still left is handled as one final pixel per row. */
+ if (count == 0) {
+ return;
+ }
+
+ uint32x2_t vtmp2;
+ uint8x8_t *vrpt2, *vppt2, *vnpt2;
+ uint8x8_t vrp2, vpp2, vnp2;
+ uint32x2_t *temp_pointer;
+ uint32x2_t vdest_val2;
+
+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
+ vrpt2 = png_ptr(uint8x8_t, &vtmp2);
+ vrp2 = *vrpt2;
+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
+ vppt2 = png_ptr(uint8x8_t, &vtmp2);
+ vpp2 = *vppt2;
+ vtmp2 = vld1_u32(png_ptrc(uint32_t, np));
+ vnpt2 = png_ptr(uint8x8_t, &vtmp2);
+ vnp2 = *vnpt2;
+
+ vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast);
+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
+
+ vdest_val2 = png_ldr(uint32x2_t, &vdest);
+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
+ /* Second row's final pixel, again using the first row's output as the row above. */
+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
+
+ vdest_val2 = png_ldr(uint32x2_t, &vdestN);
+ vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0);
+}
+#endif /* PNG_MULTY_LINE_ENABLE */
#endif /* PNG_ARM_NEON_OPT > 0 */
#endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */
diff --git a/pngpread.c b/pngpread.c
index e283627b7..43ec512df 100644
--- a/pngpread.c
+++ b/pngpread.c
@@ -264,9 +264,22 @@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr)
png_ptr->idat_size = png_ptr->push_length;
png_ptr->process_mode = PNG_READ_IDAT_MODE;
png_push_have_info(png_ptr, info_ptr);
- png_ptr->zstream.avail_out =
- (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
- png_ptr->iwidth) + 1;
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
+ (png_ptr->transformations & PNG_CHECK) == 0) {
+ int rest = png_ptr->num_rows - png_ptr->row_number;
+ int row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
+ png_ptr->iwidth) + 1) * row_num;
+ }
+ else
+#endif
+ {
+ png_ptr->zstream.avail_out =
+ (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
+ png_ptr->iwidth) + 1;
+ }
png_ptr->zstream.next_out = png_ptr->row_buf;
return;
}
@@ -623,6 +636,92 @@ png_push_read_IDAT(png_structrp png_ptr)
}
}
+#ifdef PNG_MULTY_LINE_ENABLE
+// OH ISSUE: png optimize - progressive reader: unfilter two adjacent rows with one filter call, then deliver both rows.
+static void png_push_process_row_x2(png_structrp png_ptr,
+ png_row_info row_info_in)
+{
+ png_debug(1, "in png_push_process_row_x2");
+ png_row_info row_info = row_info_in;
+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
+ png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
+ /* The filter byte + 4 passed above presumably selects the two-row (PNG_FILTER_VALUE_*_X2) variant - confirm against png.h. */
+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
+ if (png_ptr->transformations != 0)
+ png_do_read_transformations(png_ptr, &row_info);
+#endif
+
+ if (png_ptr->transformed_pixel_depth == 0)
+ {
+ png_ptr->transformed_pixel_depth = row_info.pixel_depth;
+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
+ png_error(png_ptr, "progressive row overflow");
+ }
+ /* Deliver the first row, then step row_buf to the second row of the pair. */
+ png_push_have_row(png_ptr, png_ptr->row_buf + 1);
+ png_read_push_finish_row(png_ptr);
+
+ png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1;
+
+ // Second row: no further filter call is made here; only prev_row bookkeeping, transforms and delivery remain.
+ if (png_ptr->transformations != 0)
+ {
+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); /* copy taken before the transformations below run */
+ }
+ else
+ {
+ png_ptr->prev_row = png_ptr->row_buf; /* no transforms: point prev_row at the row buffer instead of copying */
+ }
+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
+ if (png_ptr->transformations != 0)
+ png_do_read_transformations(png_ptr, &row_info);
+#endif
+
+ png_push_have_row(png_ptr, png_ptr->row_buf + 1);
+ png_read_push_finish_row(png_ptr);
+}
+/* Progressive reader: process row_num freshly inflated rows that sit back to back in row_buf. */
+static void png_push_process_multi_rows(png_structrp png_ptr, int row_num)
+{
+    png_debug(1, "in png_push_process_multi_rows");
+    const uInt stride = png_ptr->rowbytes + 1; /* filter byte + row data */
+    png_bytep const saved_row_buf = png_ptr->row_buf;
+    png_bytep const saved_prev_row = png_ptr->prev_row;
+
+    /* Row description handed by value to the two-row path. */
+    png_row_info row_info;
+    row_info.rowbytes = png_ptr->rowbytes;
+    row_info.pixel_depth = png_ptr->pixel_depth;
+    row_info.channels = png_ptr->channels;
+    row_info.bit_depth = png_ptr->bit_depth;
+    row_info.color_type = png_ptr->color_type;
+    row_info.width = png_ptr->iwidth;
+
+    int done = 0;
+    while (done < row_num) {
+        const png_byte filter = png_ptr->row_buf[0];
+        /* Two-row path needs 3 or 4 channels, a following row, and the same filter above SUB and below LAST on both rows. */
+        const int paired = (png_ptr->channels == 3 || png_ptr->channels == 4) &&
+            done < row_num - 1 && filter > PNG_FILTER_VALUE_SUB &&
+            filter < PNG_FILTER_VALUE_LAST && filter == png_ptr->row_buf[stride];
+        if (paired) {
+            png_push_process_row_x2(png_ptr, row_info);
+            done += 2;
+        } else {
+            png_push_process_row(png_ptr);
+            done += 1;
+        }
+        png_ptr->row_buf += stride;
+    }
+    /* In this mode prev_row may have been pointed into row_buf; restore the real buffer and copy the last row into it. */
+    if (png_ptr->interlaced == 0 && png_ptr->transformations == 0) {
+        png_ptr->prev_row = saved_prev_row;
+        memcpy(saved_prev_row, png_ptr->row_buf - stride, stride);
+    }
+    png_ptr->row_buf = saved_row_buf;
+}
+#endif
+
void /* PRIVATE */
png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
size_t buffer_length)
@@ -639,6 +738,17 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
/* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
png_ptr->zstream.avail_in = (uInt)buffer_length;
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ int row_num = 1;
+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
+ (png_ptr->transformations & PNG_CHECK) == 0)
+ {
+ int rest = png_ptr->num_rows - png_ptr->row_number;
+ row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
+ }
+#endif
+
/* Keep going until the decompressed data is all processed
* or the stream marked as finished.
*/
@@ -655,9 +765,20 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
if (!(png_ptr->zstream.avail_out > 0))
{
/* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
+ (png_ptr->transformations & PNG_CHECK) == 0)
+ {
+ int rest = png_ptr->num_rows - png_ptr->row_number;
+ row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
+ }
+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
+ png_ptr->iwidth) + 1) * row_num;
+#else
png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
png_ptr->iwidth) + 1);
-
+#endif
png_ptr->zstream.next_out = png_ptr->row_buf;
}
@@ -719,7 +840,12 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
/* Do we have a complete row? */
if (png_ptr->zstream.avail_out == 0)
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ png_push_process_multi_rows(png_ptr, row_num);
+#else
png_push_process_row(png_ptr);
+#endif
}
/* And check for the end of the stream. */
@@ -738,6 +864,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
void /* PRIVATE */
png_push_process_row(png_structrp png_ptr)
{
+ png_debug(1, "in png_push_process_row");
/* 1.5.6: row_info moved out of png_struct to a local here. */
png_row_info row_info;
@@ -762,8 +889,17 @@ png_push_process_row(png_structrp png_ptr)
* it may not be in the future, so this was changed just to copy the
* interlaced row count:
*/
- memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
-
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
+ {
+ png_ptr->prev_row = png_ptr->row_buf;
+ }
+ else
+#endif
+ {
+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
+ }
#ifdef PNG_READ_TRANSFORMS_SUPPORTED
if (png_ptr->transformations != 0)
png_do_read_transformations(png_ptr, &row_info);
diff --git a/pngpriv.h b/pngpriv.h
index fb521cf00..6027d9acc 100644
--- a/pngpriv.h
+++ b/pngpriv.h
@@ -148,6 +148,20 @@
# define PNG_ARM_NEON_IMPLEMENTATION 0
#endif /* PNG_ARM_NEON_OPT > 0 */
+#if defined(PNG_ARM_NEON_IMPLEMENTATION) && defined(PNG_ARM_NEON)
+// OH ISSUE: png optimize - PNG_CHECK lists the transformations that, when requested, keep the readers on the one-row-at-a-time path.
+# if PNG_ARM_NEON_IMPLEMENTATION == 1
+# define PNG_MULTY_LINE_ENABLE /* turns on the multi-row read path and the *_x2_neon filters */
+# define PNG_WRITE_NEON_ENABLE /* gates the png_write_filter_*_neon declarations */
+# define PNG_INFLATE_MAX_SIZE (65536) /* value assigned to IDAT_read_size when a read struct is created */
+# define PNG_INFLATE_ROWS (50) /* upper bound on rows inflated and processed per batch */
+# define PNG_CHECK (PNG_EXPAND | PNG_STRIP_ALPHA | PNG_RGB_TO_GRAY | PNG_ENCODE_ALPHA | \
+ PNG_PACKSWAP | PNG_GRAY_TO_RGB | PNG_COMPOSE | PNG_SCALE_16_TO_8 | PNG_16_TO_8 | \
+ PNG_BACKGROUND_EXPAND | PNG_EXPAND_16 | PNG_PACK | PNG_ADD_ALPHA | PNG_EXPAND_tRNS | \
+ PNG_RGB_TO_GRAY_ERR | PNG_RGB_TO_GRAY_WARN | PNG_FILLER | PNG_USER_TRANSFORM)
+# endif
+#endif
+
#ifndef PNG_MIPS_MSA_OPT
# if defined(__mips_msa) && (__mips_isa_rev >= 5) && \
defined(PNG_ALIGNED_MEMORY_SUPPORTED)
@@ -354,8 +368,14 @@
#endif
#ifndef PNG_INTERNAL_FUNCTION
+// OH ISSUE: png optimize - PNG_HIDE gives internal functions hidden symbol visibility when the multi-row path is enabled.
+# ifdef PNG_MULTY_LINE_ENABLE
+# define PNG_HIDE __attribute__((visibility("hidden")))
+# else
+# define PNG_HIDE /* expands to nothing */
+# endif
# define PNG_INTERNAL_FUNCTION(type, name, args, attributes)\
- PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_EMPTY attributes)
+ PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_HIDE attributes)
#endif
#ifndef PNG_INTERNAL_CALLBACK
@@ -1297,6 +1317,50 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop
row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
+#ifdef PNG_MULTY_LINE_ENABLE
+// OH ISSUE: png optimize
+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_up_x2_neon, (png_row_infop
+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg3_x2_neon, (png_row_infop
+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg4_x2_neon, (png_row_infop
+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth3_x2_neon, (png_row_infop
+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth4_x2_neon, (png_row_infop
+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
+#endif
+#ifdef PNG_WRITE_NEON_ENABLE
+// OH ISSUE: png optimize
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_sub3_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_sub3_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_sub4_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_sub4_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_up_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_up_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_avg3_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_avg3_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_avg4_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_avg4_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_paeth3_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_paeth3_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(size_t, png_write_filter_paeth4_neon, (png_structrp
+ png_ptr, size_t row_bytes, size_t lmins), PNG_EMPTY);
+PNG_INTERNAL_FUNCTION(void, png_write_filter_paeth4_neon_only, (png_structrp
+ png_ptr, size_t row_bytes), PNG_EMPTY);
+#endif
#endif
#if PNG_MIPS_MSA_IMPLEMENTATION == 1
diff --git a/pngread.c b/pngread.c
index 8fa7d9f16..71be1a26c 100644
--- a/pngread.c
+++ b/pngread.c
@@ -54,7 +54,12 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr,
* required (it will be zero in a write structure.)
*/
# ifdef PNG_SEQUENTIAL_READ_SUPPORTED
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize
+ png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE;
+#else
png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE;
+#endif
# endif
# ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED
@@ -684,6 +689,224 @@ png_read_rows(png_structrp png_ptr, png_bytepp row,
#endif /* SEQUENTIAL_READ */
#ifdef PNG_SEQUENTIAL_READ_SUPPORTED
+
+#ifdef PNG_MULTY_LINE_ENABLE
+// OH ISSUE: png optimize - sequential reader: unfilter rows i and i+1 with one filter call, then finish each row in turn.
+static void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i,
+ png_row_info row_info)
+{
+ png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]);
+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
+ png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4);
+ /* The filter byte + 4 passed above presumably selects the two-row (PNG_FILTER_VALUE_*_X2) variant - confirm against png.h. */
+#ifdef PNG_MNG_FEATURES_SUPPORTED
+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
+ {
+ /* Intrapixel differencing */
+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
+ }
+#endif
+
+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
+ if (png_ptr->transformations
+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
+ || png_ptr->num_palette_max >= 0
+# endif
+ )
+ png_do_read_transformations(png_ptr, &row_info);
+#endif
+
+ /* The transformed pixel depth should match the depth now in row_info. */
+ if (png_ptr->transformed_pixel_depth == 0)
+ {
+ png_ptr->transformed_pixel_depth = row_info.pixel_depth;
+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
+ png_error(png_ptr, "sequential row overflow");
+ }
+
+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
+ png_error(png_ptr, "internal sequential row size calculation error");
+
+ if (rows[i] != NULL)
+ png_combine_row(png_ptr, rows[i], -1);
+
+ png_read_finish_row(png_ptr);
+
+ if (png_ptr->read_row_fn != NULL)
+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
+
+ png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; /* step to the second row of the pair */
+
+ // Second row: record it as prev_row, then repeat the post-filter steps (no second filter call is made here).
+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
+
+#ifdef PNG_MNG_FEATURES_SUPPORTED
+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
+ {
+ /* Intrapixel differencing */
+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
+ }
+#endif
+
+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
+ if (png_ptr->transformations
+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
+ || png_ptr->num_palette_max >= 0
+# endif
+ )
+ png_do_read_transformations(png_ptr, &row_info);
+#endif
+
+ /* The transformed pixel depth should match the depth now in row_info. */
+ if (png_ptr->transformed_pixel_depth == 0)
+ {
+ png_ptr->transformed_pixel_depth = row_info.pixel_depth;
+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
+ png_error(png_ptr, "sequential row overflow");
+ }
+
+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
+ png_error(png_ptr, "internal sequential row size calculation error");
+
+ if (rows[i+1] != NULL)
+ png_combine_row(png_ptr, rows[i+1], -1);
+
+ png_read_finish_row(png_ptr);
+
+ if (png_ptr->read_row_fn != NULL)
+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
+
+ png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; /* leaves row_buf just past the pair */
+}
+/* Sequential reader: inflate row_num rows into row_buf in one call, then unfilter, transform and deliver each one into rows[]. */
+static void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows,
+ png_uint_32 row_num, png_row_info row_info_in)
+{
+ if (png_ptr == NULL)
+ return;
+
+ png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)",
+ (unsigned long)png_ptr->row_number, png_ptr->pass);
+
+ if ((png_ptr->mode & PNG_HAVE_IDAT) == 0)
+ png_error(png_ptr, "Invalid attempt to read row data");
+
+ /* Fill the row with IDAT data: */
+ uInt row_bytes = row_info_in.rowbytes;
+ png_ptr->row_buf[0]=255; /* 255 to force error if no data was found */
+ png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num);
+ png_bytep temp_row = png_ptr->row_buf; /* row_buf is walked across the batch and restored at the end */
+
+ for (png_uint_32 i = 0; i < row_num; i++) {
+ png_row_info row_info = row_info_in;
+ // check if the x2_filter is effective: only supports channels 3 or 4
+ if ((row_info_in.channels == 3 || row_info_in.channels == 4) &&
+ i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
+ png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
+ png_ptr->row_buf[0] == png_ptr->row_buf[row_info_in.rowbytes + 1])
+ {
+ png_read_two_rows(png_ptr, rows, i, row_info); /* handles rows i and i+1 and advances row_buf past both */
+ i++;
+ continue;
+ }
+ if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
+ {
+ if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
+ png_ptr->prev_row + 1, png_ptr->row_buf[0]);
+ else
+ png_error(png_ptr, "bad adaptive filter value");
+ }
+ /* A filter byte at or above PNG_FILTER_VALUE_LAST (including the 255 sentinel) is reported through png_error, not passed through unfiltered. */
+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_in.rowbytes + 1);
+
+#ifdef PNG_MNG_FEATURES_SUPPORTED
+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
+ {
+ /* Intrapixel differencing */
+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
+ }
+#endif
+
+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
+ if (png_ptr->transformations
+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
+ || png_ptr->num_palette_max >= 0
+# endif
+ )
+ png_do_read_transformations(png_ptr, &row_info);
+#endif
+
+ /* The transformed pixel depth should match the depth now in row_info. */
+ if (png_ptr->transformed_pixel_depth == 0)
+ {
+ png_ptr->transformed_pixel_depth = row_info.pixel_depth;
+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
+ png_error(png_ptr, "sequential row overflow");
+ }
+
+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
+ png_error(png_ptr, "internal sequential row size calculation error");
+
+ if (rows[i] != NULL)
+ png_combine_row(png_ptr, rows[i], -1);
+
+ png_read_finish_row(png_ptr);
+
+ if (png_ptr->read_row_fn != NULL)
+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
+
+ png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1;
+ }
+ png_ptr->row_buf = temp_row;
+}
+/* Warn once per requested transformation whose read side was compiled out while its write side was compiled in. */
+static void png_warn_check(png_structrp png_ptr)
+{
+#ifdef PNG_WARNINGS_SUPPORTED
+ /* Check for transforms that have been set but were defined out */
+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED)
+ if ((png_ptr->transformations & PNG_INVERT_MONO) != 0)
+ png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined");
+#endif
+
+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED)
+ if ((png_ptr->transformations & PNG_FILLER) != 0)
+ png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined");
+#endif
+
+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \
+ !defined(PNG_READ_PACKSWAP_SUPPORTED)
+ if ((png_ptr->transformations & PNG_PACKSWAP) != 0)
+ png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined");
+#endif
+
+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED)
+ if ((png_ptr->transformations & PNG_PACK) != 0)
+ png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined");
+#endif
+
+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED)
+ if ((png_ptr->transformations & PNG_SHIFT) != 0)
+ png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined");
+#endif
+
+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED)
+ if ((png_ptr->transformations & PNG_BGR) != 0)
+ png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined");
+#endif
+
+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED)
+ if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0)
+ png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined");
+#endif
+#endif /* WARNINGS */
+}
+#endif // PNG_MULTY_LINE_ENABLE
+
/* Read the entire image. If the image has an alpha channel or a tRNS
* chunk, and you have called png_handle_alpha()[*], you will need to
* initialize the image to the current image that PNG will be overlaying.
@@ -745,13 +968,45 @@ png_read_image(png_structrp png_ptr, png_bytepp image)
image_height=png_ptr->height;
- for (j = 0; j < pass; j++)
- {
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize - fast path for non-interlaced 8-bit images with no PNG_CHECK transform set: rows are read in batches of up to PNG_INFLATE_ROWS
+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
+ (png_ptr->transformations & PNG_CHECK) == 0) {
+ if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0)
+ png_read_start_row(png_ptr);
+
+ png_warn_check(png_ptr); /* called once here, before the batch loop */
+ png_row_info row_info; /* row description built from png_ptr and passed by value to every batch */
+ row_info.width = png_ptr->iwidth;
+ row_info.color_type = png_ptr->color_type;
+ row_info.bit_depth = png_ptr->bit_depth;
+ row_info.channels = png_ptr->channels;
+ row_info.pixel_depth = png_ptr->pixel_depth;
+ row_info.rowbytes = png_ptr->rowbytes;
+
rp = image;
- for (i = 0; i < image_height; i++)
+ int row_num = PNG_INFLATE_ROWS; /* rows in the current batch */
+ for (i = 0; i < image_height; i += PNG_INFLATE_ROWS)
{
- png_read_row(png_ptr, *rp, NULL);
- rp++;
+ if (image_height - i < PNG_INFLATE_ROWS) /* final, partial batch */
+ {
+ row_num = image_height - i;
+ }
+ png_read_muilty_rows(png_ptr, rp, row_num, row_info);
+ rp += row_num; /* advance past the row pointers just handed to the batch */
+ }
+ }
+ else /* every other image takes the original row-at-a-time loop below */
+#endif
+ {
+ for (j = 0; j < pass; j++)
+ {
+ rp = image;
+ for (i = 0; i < image_height; i++)
+ {
+ png_read_row(png_ptr, *rp, NULL);
+ rp++;
+ }
}
}
}
diff --git a/pngrutil.c b/pngrutil.c
index 9ac8ec11f..8afdf4fa5 100644
--- a/pngrutil.c
+++ b/pngrutil.c
@@ -4134,7 +4134,12 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row,
* PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic
* implementations. See png_init_filter_functions above.
*/
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize - upper bound widened to PNG_FILTER_VALUE_LAST_X2 so the *_X2 filter values are also dispatched through read_filter
+ if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2)
+#else
if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST)
+#endif
{
if (pp->read_filter[0] == NULL)
png_init_filter_functions(pp);
@@ -4606,7 +4611,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED)
row_bytes + 48);
else
+ {
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize - size the row buffer for a batch of up to PNG_INFLATE_ROWS rows
+ png_uint_32 row_num = 1; /* rows the buffer must hold; stays 1 when the multi-row conditions do not hold */
+ if (png_ptr->bit_depth == 8 &&
+ (png_ptr->transformations & PNG_CHECK) == 0)
+ {
+ row_num = png_ptr->height < PNG_INFLATE_ROWS ?
+ png_ptr->height : PNG_INFLATE_ROWS;
+ }
+ if (row_num > 1 && row_bytes > (PNG_SIZE_MAX - 48) / row_num) /* row_bytes * row_num + 48 must not wrap size_t */
+ png_error(png_ptr, "Row buffer size overflow"); /* png_malloc reports failure via png_error, so no NULL test is needed after it */
+ png_ptr->big_row_buf = (png_bytep)png_malloc(
+ png_ptr, row_bytes * row_num + 48);
+#else
png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
+#endif
+ }
png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
diff --git a/pngstruct.h b/pngstruct.h
index e591d94d5..7c3846475 100644
--- a/pngstruct.h
+++ b/pngstruct.h
@@ -140,6 +140,14 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp;
#define PNG_COLORSPACE_CANCEL(flags) (0xffff ^ (flags))
#endif /* COLORSPACE || GAMMA */
+#ifdef PNG_MULTY_LINE_ENABLE
+// OH ISSUE: png optimize - filter values for the *_X2 read-filter variants; each is the base filter value + 4 and is used as read_filter[value - 1]
+#define PNG_FILTER_VALUE_UP_X2 (6) // PNG_FILTER_VALUE_UP + 4
+#define PNG_FILTER_VALUE_AVG_X2 (7) // PNG_FILTER_VALUE_AVG + 4
+#define PNG_FILTER_VALUE_PAETH_X2 (8) // PNG_FILTER_VALUE_PAETH + 4
+#define PNG_FILTER_VALUE_LAST_X2 (9) // PNG_FILTER_VALUE_LAST + 4; exclusive upper bound, also sizes read_filter
+#endif
+
struct png_struct_def
{
#ifdef PNG_SETJMP_SUPPORTED
@@ -467,8 +475,14 @@ struct png_struct_def
png_bytep big_prev_row;
/* New member added in libpng-1.5.7 */
+#ifdef PNG_MULTY_LINE_ENABLE
+ // OH ISSUE: png optimize - table grown to PNG_FILTER_VALUE_LAST_X2 - 1 entries so the *_X2 filter values have slots
+ void (*read_filter[PNG_FILTER_VALUE_LAST_X2 - 1])(png_row_infop row_info,
+ png_bytep row, png_const_bytep prev_row);
+#else
void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info,
png_bytep row, png_const_bytep prev_row);
+#endif
#ifdef PNG_READ_SUPPORTED
#if defined(PNG_COLORSPACE_SUPPORTED) || defined(PNG_GAMMA_SUPPORTED)
diff --git a/pngtrans.c b/pngtrans.c
index 1100f46eb..99736747a 100644
--- a/pngtrans.c
+++ b/pngtrans.c
@@ -13,6 +13,19 @@
#include "pngpriv.h"
+#ifdef PNG_MULTY_LINE_ENABLE
+# if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
+# include <arm64_neon.h>
+# else
+# include <arm_neon.h>
+# endif
+# define STEP_GRAY (16)
+# define STEP_GA (32)
+# define STEP_RGB (48)
+# define STEP_RGBA (64)
+# define INDEX2 (2)
+#endif
+
#if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
#if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED)
@@ -269,13 +282,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
if (row_info->color_type == PNG_COLOR_TYPE_GRAY)
{
png_bytep rp = row;
- size_t i;
- size_t istop = row_info->rowbytes;
-
- for (i = 0; i < istop; i++)
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_GRAY;
+ for (; rp < rp_stop_neon; rp += STEP_GRAY)
+ {
+ uint8x16_t gray = vld1q_u8(rp);
+ gray = ~gray;
+ vst1q_u8(rp, gray);
+ }
+#endif
+ for (; rp < rp_stop; rp++)
{
*rp = (png_byte)(~(*rp));
- rp++;
}
}
@@ -283,13 +302,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
row_info->bit_depth == 8)
{
png_bytep rp = row;
- size_t i;
- size_t istop = row_info->rowbytes;
-
- for (i = 0; i < istop; i += 2)
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_GA;
+ for (; rp < rp_stop_neon; rp += STEP_GA)
+ {
+ uint8x16x2_t gray_alpha = vld2q_u8(rp);
+ gray_alpha.val[0] = ~gray_alpha.val[0];
+ vst2q_u8(rp, gray_alpha);
+ }
+#endif
+ for (; rp < rp_stop; rp += 2)
{
*rp = (png_byte)(~(*rp));
- rp += 2;
}
}
@@ -298,14 +323,21 @@ png_do_invert(png_row_infop row_info, png_bytep row)
row_info->bit_depth == 16)
{
png_bytep rp = row;
- size_t i;
- size_t istop = row_info->rowbytes;
-
- for (i = 0; i < istop; i += 4)
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
+ for (; rp < rp_stop_neon; rp += STEP_RGBA)
+ {
+ uint8x16x4_t gray_alpha = vld4q_u8(rp);
+ gray_alpha.val[0] = ~gray_alpha.val[0];
+ gray_alpha.val[1] = ~gray_alpha.val[1];
+ vst4q_u8(rp, gray_alpha);
+ }
+#endif
+ for (; rp < rp_stop; rp += 4)
{
*rp = (png_byte)(~(*rp));
*(rp + 1) = (png_byte)(~(*(rp + 1)));
- rp += 4;
}
}
#endif
@@ -323,10 +355,19 @@ png_do_swap(png_row_infop row_info, png_bytep row)
if (row_info->bit_depth == 16)
{
png_bytep rp = row;
- png_uint_32 i;
- png_uint_32 istop= row_info->width * row_info->channels;
-
- for (i = 0; i < istop; i++, rp += 2)
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_GA;
+ for (; rp < rp_stop_neon; rp += STEP_GA)
+ {
+ uint8x16x2_t gray = vld2q_u8(rp);
+ uint8x16_t tmp = gray.val[0];
+ gray.val[0] = gray.val[1];
+ gray.val[1] = tmp;
+ vst2q_u8(rp, gray);
+ }
+#endif
+ for (; rp < rp_stop; rp += 2)
{
#ifdef PNG_BUILTIN_BSWAP16_SUPPORTED
/* Feature added to libpng-1.6.11 for testing purposes, not
@@ -622,15 +663,24 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0)
{
- png_uint_32 row_width = row_info->width;
if (row_info->bit_depth == 8)
{
if (row_info->color_type == PNG_COLOR_TYPE_RGB)
{
- png_bytep rp;
- png_uint_32 i;
-
- for (i = 0, rp = row; i < row_width; i++, rp += 3)
+ png_bytep rp = row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_RGB;
+ for (; rp < rp_stop_neon; rp += STEP_RGB)
+ {
+ uint8x16x3_t bgr = vld3q_u8(rp);
+ uint8x16_t tmp = bgr.val[INDEX2];
+ bgr.val[INDEX2] = bgr.val[0];
+ bgr.val[0] = tmp;
+ vst3q_u8(rp, bgr);
+ }
+#endif
+ for (; rp < rp_stop; rp += 3)
{
png_byte save = *rp;
*rp = *(rp + 2);
@@ -640,10 +690,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
{
- png_bytep rp;
- png_uint_32 i;
-
- for (i = 0, rp = row; i < row_width; i++, rp += 4)
+ png_bytep rp = row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
+ for (; rp < rp_stop_neon; rp += STEP_RGBA)
+ {
+ uint8x16x4_t bgra = vld4q_u8(rp);
+ uint8x16_t tmp = bgra.val[INDEX2];
+ bgra.val[INDEX2] = bgra.val[0];
+ bgra.val[0] = tmp;
+ vst4q_u8(rp, bgra);
+ }
+#endif
+ for (; rp < rp_stop; rp += 4)
{
png_byte save = *rp;
*rp = *(rp + 2);
@@ -657,10 +717,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
{
if (row_info->color_type == PNG_COLOR_TYPE_RGB)
{
- png_bytep rp;
- png_uint_32 i;
-
- for (i = 0, rp = row; i < row_width; i++, rp += 6)
+ png_bytep rp = row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_RGB;
+ for (; rp < rp_stop_neon; rp += STEP_RGB)
+ {
+ uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp);
+ uint16x8_t tmp = bgr.val[INDEX2];
+ bgr.val[INDEX2] = bgr.val[0];
+ bgr.val[0] = tmp;
+ vst3q_u16((unsigned short *)rp, bgr);
+ }
+#endif
+ for (; rp < rp_stop; rp += 6)
{
png_byte save = *rp;
*rp = *(rp + 4);
@@ -673,10 +743,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
{
- png_bytep rp;
- png_uint_32 i;
-
- for (i = 0, rp = row; i < row_width; i++, rp += 8)
+ png_bytep rp = row;
+ png_bytep rp_stop = row + row_info->rowbytes;
+#ifdef PNG_MULTY_LINE_ENABLE
+ png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
+ for (; rp < rp_stop_neon; rp += STEP_RGBA)
+ {
+ uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp);
+ uint16x8_t tmp = bgra.val[INDEX2];
+ bgra.val[INDEX2] = bgra.val[0];
+ bgra.val[0] = tmp;
+ vst4q_u16((unsigned short *)rp, bgra);
+ }
+#endif
+ for (; rp < rp_stop; rp += 8)
{
png_byte save = *rp;
*rp = *(rp + 4);
diff --git a/pngwutil.c b/pngwutil.c
index 16345e4c0..212c090b6 100644
--- a/pngwutil.c
+++ b/pngwutil.c
@@ -16,6 +16,20 @@
#ifdef PNG_WRITE_SUPPORTED
#ifdef PNG_WRITE_INT_FUNCTIONS_SUPPORTED
+#ifdef PNG_WRITE_NEON_ENABLE
+// OH ISSUE: png optimize
+# if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
+# include <arm64_neon.h>
+# else
+# include <arm_neon.h>
+# endif
+# define STEP 16
+# define MID 128
+# define SHIFT_RGB 13
+# define SHIFT_RGBA 12
+# define BYTE_RGB 3
+# define BYTE_RGBA 4
+#endif
/* Place a 32-bit number into a buffer in PNG byte order. We work
* with unsigned numbers for convenience, although one supported
* ancillary chunk uses signed (two's complement) numbers.
@@ -2275,10 +2289,939 @@ png_write_filtered_row(png_structrp png_ptr, png_bytep filtered_row,
size_t row_bytes);
#ifdef PNG_WRITE_FILTER_SUPPORTED
+#ifdef PNG_WRITE_NEON_ENABLE
+size_t png_write_filter_sub3_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* SUB filter with a 3-byte left distance: writes try_row from row_buf and returns the accumulated byte-magnitude sum, returning early once it exceeds lmins. NOTE(review): external linkage, yet only called from this file in the visible hunks - confirm whether static was intended */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); /* last 3 bytes of tmp then first 13 of qrp: each lane's neighbour 3 bytes to the left */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ tmp = qrp;
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+ rp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ if (count == row_bytes) /* vector loop never ran (row_bytes < STEP): the first pixel has no left neighbour, so it is copied */
+ {
+ dp[0] = rp[0];
+ dp[1] = rp[1];
+ dp[2] = rp[2];
+ sum += MID - abs((int)dp[0] - MID); /* v < 128 ? v : 256 - v */
+ sum += MID - abs((int)dp[1] - MID);
+ sum += MID - abs((int)dp[2] - MID);
+ rp += BYTE_RGB;
+ dp += BYTE_RGB;
+ count -= BYTE_RGB;
+ }
+
+ png_bytep lp = rp - BYTE_RGB; /* left neighbour, 3 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
+ sum += MID - abs((int)*dp++ - MID);
+ count--;
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_sub3_neon_only(png_structrp png_ptr,
+ size_t row_bytes)
+{ /* SUB filter with a 3-byte left distance: writes the whole of try_row from row_buf; no sum is computed and there is no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); /* each lane's neighbour 3 bytes to the left */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ tmp = qrp;
+ rp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ if (count == row_bytes) /* vector loop never ran: copy the first pixel, which has no left neighbour */
+ {
+ dp[0] = rp[0];
+ dp[1] = rp[1];
+ dp[2] = rp[2];
+ rp += BYTE_RGB;
+ dp += BYTE_RGB;
+ count -= BYTE_RGB;
+ }
+
+ png_bytep lp = rp - BYTE_RGB; /* left neighbour, 3 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp++ = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
+ count--;
+ }
+}
+
+size_t png_write_filter_sub4_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* SUB filter with a 4-byte left distance: writes try_row from row_buf and returns the accumulated byte-magnitude sum, returning early once it exceeds lmins */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); /* last 4 bytes of tmp then first 12 of qrp: each lane's neighbour 4 bytes to the left */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ tmp = qrp;
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+ rp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ if (count == row_bytes) /* vector loop never ran (row_bytes < STEP): the first pixel has no left neighbour, so it is copied */
+ {
+ dp[0] = rp[0];
+ dp[1] = rp[1];
+ dp[2] = rp[2];
+ dp[3] = rp[3];
+ sum += MID - abs((int)dp[0] - MID); /* v < 128 ? v : 256 - v */
+ sum += MID - abs((int)dp[1] - MID);
+ sum += MID - abs((int)dp[2] - MID);
+ sum += MID - abs((int)dp[3] - MID);
+ rp += BYTE_RGBA;
+ dp += BYTE_RGBA;
+ count -= BYTE_RGBA;
+ }
+
+ png_bytep lp = rp - BYTE_RGBA; /* left neighbour, 4 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
+ sum += MID - abs((int)*dp++ - MID);
+ count--;
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_sub4_neon_only(png_structrp png_ptr,
+ size_t row_bytes)
+{ /* SUB filter with a 4-byte left distance: writes the whole of try_row from row_buf; no sum is computed and there is no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_SUB;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); /* each lane's neighbour 4 bytes to the left */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ tmp = qrp;
+ rp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ if (count == row_bytes) /* vector loop never ran: copy the first pixel, which has no left neighbour */
+ {
+ dp[0] = rp[0];
+ dp[1] = rp[1];
+ dp[2] = rp[2];
+ dp[3] = rp[3];
+ rp += BYTE_RGBA;
+ dp += BYTE_RGBA;
+ count -= BYTE_RGBA;
+ }
+
+ png_bytep lp = rp - BYTE_RGBA; /* left neighbour, 4 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp++ = (png_byte)(((int)*rp++ - (int)*lp++) & 0xff);
+ count--;
+ }
+}
+
+size_t png_write_filter_up_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* UP filter: try_row = row_buf - prev_row byte by byte (modulo 256); returns the accumulated byte-magnitude sum, returning early once it exceeds lmins */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_UP;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qpp = vld1q_u8(pp);
+ uint8x16_t qdp = vsubq_u8(qrp, qpp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ while (count > 0) /* scalar tail */
+ {
+ *dp = (png_byte)(((int)*rp++ - (int)*pp++) & 0xff);
+ sum += MID - abs((int)*dp++ - MID); /* v < 128 ? v : 256 - v */
+ count--;
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_up_neon_only(png_structrp png_ptr, size_t row_bytes)
+{ /* UP filter: try_row = row_buf - prev_row byte by byte (modulo 256) for the whole row; no sum, no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_UP;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qpp = vld1q_u8(pp);
+ uint8x16_t qdp = vsubq_u8(qrp, qpp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ while (count > 0) /* scalar tail */
+ {
+ *dp++ = (png_byte)(((int)*rp++ - (int)*pp++) & 0xff);
+ count--;
+ }
+}
+
+size_t png_write_filter_avg3_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* AVG filter with a 3-byte left distance: try_row = row_buf - (above + left) / 2; returns the accumulated byte-magnitude sum, returning early once it exceeds lmins */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qpp = vld1q_u8(pp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); /* each lane's neighbour 3 bytes to the left */
+ qlp = vhaddq_u8(qpp, qlp); /* halving add: (above + left) >> 1 */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+ tmp = qrp;
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left neighbour, so only above / 2 is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
+ sum += MID - abs((int)dp[0] - MID); /* v < 128 ? v : 256 - v */
+ sum += MID - abs((int)dp[1] - MID);
+ sum += MID - abs((int)dp[2] - MID);
+ rp += BYTE_RGB;
+ pp += BYTE_RGB;
+ dp += BYTE_RGB;
+ count -= BYTE_RGB;
+ }
+
+ png_bytep lp = rp - BYTE_RGB; /* left neighbour, 3 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
+ count--;
+ sum += MID - abs((int)*dp++ - MID);
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_avg3_neon_only(png_structrp png_ptr,
+ size_t row_bytes)
+{ /* AVG filter with a 3-byte left distance: try_row = row_buf - (above + left) / 2 for the whole row; no sum, no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qpp = vld1q_u8(pp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGB); /* each lane's neighbour 3 bytes to the left */
+ qlp = vhaddq_u8(qpp, qlp); /* halving add: (above + left) >> 1 */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ tmp = qrp;
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left neighbour, so only above / 2 is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
+ rp += BYTE_RGB;
+ pp += BYTE_RGB;
+ dp += BYTE_RGB;
+ count -= BYTE_RGB;
+ }
+
+ png_bytep lp = rp - BYTE_RGB; /* left neighbour, 3 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp++ = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
+ count--;
+ }
+}
+
+size_t png_write_filter_avg4_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* AVG filter with a 4-byte left distance: try_row = row_buf - (above + left) / 2; returns the accumulated byte-magnitude sum, returning early once it exceeds lmins */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qpp = vld1q_u8(pp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); /* each lane's neighbour 4 bytes to the left */
+ qlp = vhaddq_u8(qpp, qlp); /* halving add: (above + left) >> 1 */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+ tmp = qrp;
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left neighbour, so only above / 2 is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
+ dp[3] = (png_byte)(((int)rp[3] - ((int)pp[3] / 2)) & 0xff);
+ sum += MID - abs((int)dp[0] - MID); /* v < 128 ? v : 256 - v */
+ sum += MID - abs((int)dp[1] - MID);
+ sum += MID - abs((int)dp[2] - MID);
+ sum += MID - abs((int)dp[3] - MID);
+ rp += BYTE_RGBA;
+ pp += BYTE_RGBA;
+ dp += BYTE_RGBA;
+ count -= BYTE_RGBA;
+ }
+
+ png_bytep lp = rp - BYTE_RGBA; /* left neighbour, 4 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
+ count--;
+ sum += MID - abs((int)*dp++ - MID);
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_avg4_neon_only(png_structrp png_ptr,
+ size_t row_bytes)
+{ /* AVG filter with a 4-byte left distance: try_row = row_buf - (above + left) / 2 for the whole row; no sum, no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_AVG;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp = vdupq_n_u8(0); /* previous 16 input bytes; zero ahead of the row start */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t qpp = vld1q_u8(pp);
+ uint8x16_t qlp = vextq_u8(tmp, qrp, SHIFT_RGBA); /* each lane's neighbour 4 bytes to the left */
+ qlp = vhaddq_u8(qpp, qlp); /* halving add: (above + left) >> 1 */
+ uint8x16_t qdp = vsubq_u8(qrp, qlp); /* modulo-256 difference */
+ vst1q_u8(dp, qdp);
+ tmp = qrp;
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left neighbour, so only above / 2 is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - ((int)pp[0] / 2)) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - ((int)pp[1] / 2)) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - ((int)pp[2] / 2)) & 0xff);
+ dp[3] = (png_byte)(((int)rp[3] - ((int)pp[3] / 2)) & 0xff);
+ rp += BYTE_RGBA;
+ pp += BYTE_RGBA;
+ dp += BYTE_RGBA;
+ count -= BYTE_RGBA;
+ }
+
+ png_bytep lp = rp - BYTE_RGBA; /* left neighbour, 4 bytes back */
+ while (count > 0) /* scalar tail */
+ {
+ *dp++ = (png_byte)(((int)*rp++ - (((int)*pp++ + (int)*lp++) / 2)) & 0xff);
+ count--;
+ }
+}
+
+size_t png_write_filter_paeth3_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* PAETH filter with a 3-byte left distance: try_row = row_buf - predictor(a = left, b = above, c = upper-left); returns the accumulated byte-magnitude sum, returning early once it exceeds lmins */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp_a = vdupq_n_u8(0); /* previous 16 bytes of the current row; zero ahead of the row start */
+ uint8x16_t tmp_c = vdupq_n_u8(0); /* previous 16 bytes of the row above */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t b = vld1q_u8(pp); /* above */
+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGB); /* left: current row shifted by 3 bytes */
+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGB); /* upper-left: row above shifted by 3 bytes */
+ tmp_a = qrp;
+ tmp_c = b;
+
+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); /* widen to 16 bits so the differences below cannot wrap */
+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
+
+ int16x8_t p = vsubq_s16(b_hign, c_hign); /* upper 8 lanes: p = b - c */
+ int16x8_t pc = vsubq_s16(a_hign, c_hign); /* a - c */
+ int16x8_t pa = vabsq_s16(p); /* pa = |b - c| */
+ int16x8_t pb = vabsq_s16(pc); /* pb = |a - c| */
+ pc = vabsq_s16(vaddq_s16(p, pc)); /* pc = |a + b - 2c| */
+ uint16x8_t p1_u = vcleq_s16(pa, pb);
+ uint16x8_t pa_u = vcleq_s16(pa, pc);
+ uint16x8_t pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u); /* pa <= pb && pa <= pc */
+ uint8x8_t d_hign = vmovn_u16(pb_u); /* narrow the lane masks back to bytes */
+ uint8x8_t e_hign = vmovn_u16(p1_u);
+
+ p = vsubq_s16(b_low, c_low); /* same computation for the lower 8 lanes */
+ pc = vsubq_s16(a_low, c_low);
+ pa = vabsq_s16(p);
+ pb = vabsq_s16(pc);
+ pc = vabsq_s16(vaddq_s16(p, pc));
+ p1_u = vcleq_s16(pa, pb);
+ pa_u = vcleq_s16(pa, pc);
+ pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u);
+ uint8x8_t d_low = vmovn_u16(pb_u);
+ uint8x8_t e_low = vmovn_u16(p1_u);
+
+ uint8x16_t d = vcombine_u8(d_low, d_hign);
+ uint8x16_t e = vcombine_u8(e_low, e_hign);
+ d = vbslq_u8(d, b, c); /* (pb <= pc) ? b : c */
+ e = vbslq_u8(e, a, d); /* (pa <= pb && pa <= pc) ? a : the choice above */
+
+ uint8x16_t qdp = vsubq_u8(qrp, e); /* modulo-256 difference from the predictor */
+ vst1q_u8(dp, qdp);
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left or upper-left neighbour, so the byte above is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
+ sum += MID - abs((int)dp[0] - MID); /* v < 128 ? v : 256 - v */
+ sum += MID - abs((int)dp[1] - MID);
+ sum += MID - abs((int)dp[2] - MID);
+ rp += BYTE_RGB;
+ pp += BYTE_RGB;
+ dp += BYTE_RGB;
+ count -= BYTE_RGB;
+ }
+
+ png_bytep cp = pp - BYTE_RGB; /* upper-left neighbour */
+ png_bytep lp = rp - BYTE_RGB; /* left neighbour */
+ while (count > 0) /* scalar tail: same predictor, one byte at a time */
+ {
+ int a, b, c, pa, pb, pc, p;
+
+ b = *pp++;
+ c = *cp++;
+ a = *lp++;
+
+ p = b - c;
+ pc = a - c;
+
+ pa = abs(p);
+ pb = abs(pc);
+ pc = abs(p + pc);
+
+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
+ *dp = (png_byte)(((int)*rp++ - p) & 0xff);
+
+ count--;
+ sum += MID - abs((int)*dp++ - MID);
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_paeth3_neon_only(png_structrp png_ptr,
+ size_t row_bytes)
+{ /* PAETH filter with a 3-byte left distance: try_row = row_buf - predictor(a = left, b = above, c = upper-left) for the whole row; no sum, no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp_a = vdupq_n_u8(0); /* previous 16 bytes of the current row; zero ahead of the row start */
+ uint8x16_t tmp_c = vdupq_n_u8(0); /* previous 16 bytes of the row above */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t b = vld1q_u8(pp); /* above */
+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGB); /* left: current row shifted by 3 bytes */
+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGB); /* upper-left: row above shifted by 3 bytes */
+ tmp_a = qrp;
+ tmp_c = b;
+
+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); /* widen to 16 bits so the differences below cannot wrap */
+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
+
+ int16x8_t p = vsubq_s16(b_hign, c_hign); /* upper 8 lanes: p = b - c */
+ int16x8_t pc = vsubq_s16(a_hign, c_hign); /* a - c */
+ int16x8_t pa = vabsq_s16(p); /* pa = |b - c| */
+ int16x8_t pb = vabsq_s16(pc); /* pb = |a - c| */
+ pc = vabsq_s16(vaddq_s16(p, pc)); /* pc = |a + b - 2c| */
+ uint16x8_t p1_u = vcleq_s16(pa, pb);
+ uint16x8_t pa_u = vcleq_s16(pa, pc);
+ uint16x8_t pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u); /* pa <= pb && pa <= pc */
+ uint8x8_t d_hign = vmovn_u16(pb_u); /* narrow the lane masks back to bytes */
+ uint8x8_t e_hign = vmovn_u16(p1_u);
+
+ p = vsubq_s16(b_low, c_low); /* same computation for the lower 8 lanes */
+ pc = vsubq_s16(a_low, c_low);
+ pa = vabsq_s16(p);
+ pb = vabsq_s16(pc);
+ pc = vabsq_s16(vaddq_s16(p, pc));
+ p1_u = vcleq_s16(pa, pb);
+ pa_u = vcleq_s16(pa, pc);
+ pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u);
+ uint8x8_t d_low = vmovn_u16(pb_u);
+ uint8x8_t e_low = vmovn_u16(p1_u);
+
+ uint8x16_t d = vcombine_u8(d_low, d_hign);
+ uint8x16_t e = vcombine_u8(e_low, e_hign);
+ d = vbslq_u8(d, b, c); /* (pb <= pc) ? b : c */
+ e = vbslq_u8(e, a, d); /* (pa <= pb && pa <= pc) ? a : the choice above */
+
+ uint8x16_t qdp = vsubq_u8(qrp, e); /* modulo-256 difference from the predictor */
+ vst1q_u8(dp, qdp);
+
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left or upper-left neighbour, so the byte above is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
+ rp += BYTE_RGB;
+ pp += BYTE_RGB;
+ dp += BYTE_RGB;
+ count -= BYTE_RGB;
+ }
+
+ png_bytep cp = pp - BYTE_RGB; /* upper-left neighbour */
+ png_bytep lp = rp - BYTE_RGB; /* left neighbour */
+ while (count > 0) /* scalar tail: same predictor, one byte at a time */
+ {
+ int a, b, c, pa, pb, pc, p;
+
+ b = *pp++;
+ c = *cp++;
+ a = *lp++;
+
+ p = b - c;
+ pc = a - c;
+
+ pa = abs(p);
+ pb = abs(pc);
+ pc = abs(p + pc);
+
+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
+ *dp++ = (png_byte)(((int)*rp++ - p) & 0xff);
+ count--;
+ }
+}
+
+size_t png_write_filter_paeth4_neon(png_structrp png_ptr,
+ size_t row_bytes, size_t lmins)
+{ /* PAETH filter with a 4-byte left distance: try_row = row_buf - predictor(a = left, b = above, c = upper-left); returns the accumulated byte-magnitude sum, returning early once it exceeds lmins */
+ size_t sum = 0;
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp_a = vdupq_n_u8(0); /* previous 16 bytes of the current row; zero ahead of the row start */
+ uint8x16_t tmp_c = vdupq_n_u8(0); /* previous 16 bytes of the row above */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t b = vld1q_u8(pp); /* above */
+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGBA); /* left: current row shifted by 4 bytes */
+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGBA); /* upper-left: row above shifted by 4 bytes */
+ tmp_a = qrp;
+ tmp_c = b;
+
+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); /* widen to 16 bits so the differences below cannot wrap */
+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
+
+ int16x8_t p = vsubq_s16(b_hign, c_hign); /* upper 8 lanes: p = b - c */
+ int16x8_t pc = vsubq_s16(a_hign, c_hign); /* a - c */
+ int16x8_t pa = vabsq_s16(p); /* pa = |b - c| */
+ int16x8_t pb = vabsq_s16(pc); /* pb = |a - c| */
+ pc = vabsq_s16(vaddq_s16(p, pc)); /* pc = |a + b - 2c| */
+ uint16x8_t p1_u = vcleq_s16(pa, pb);
+ uint16x8_t pa_u = vcleq_s16(pa, pc);
+ uint16x8_t pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u); /* pa <= pb && pa <= pc */
+ uint8x8_t d_hign = vmovn_u16(pb_u); /* narrow the lane masks back to bytes */
+ uint8x8_t e_hign = vmovn_u16(p1_u);
+
+ p = vsubq_s16(b_low, c_low); /* same computation for the lower 8 lanes */
+ pc = vsubq_s16(a_low, c_low);
+ pa = vabsq_s16(p);
+ pb = vabsq_s16(pc);
+ pc = vabsq_s16(vaddq_s16(p, pc));
+ p1_u = vcleq_s16(pa, pb);
+ pa_u = vcleq_s16(pa, pc);
+ pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u);
+ uint8x8_t d_low = vmovn_u16(pb_u);
+ uint8x8_t e_low = vmovn_u16(p1_u);
+
+ uint8x16_t d = vcombine_u8(d_low, d_hign);
+ uint8x16_t e = vcombine_u8(e_low, e_hign);
+ d = vbslq_u8(d, b, c); /* (pb <= pc) ? b : c */
+ e = vbslq_u8(e, a, d); /* (pa <= pb && pa <= pc) ? a : the choice above */
+
+ uint8x16_t qdp = vsubq_u8(qrp, e); /* modulo-256 difference from the predictor */
+ vst1q_u8(dp, qdp);
+ int8x16_t v_s = vreinterpretq_s8_u8(qdp);
+ v_s = vabsq_s8(v_s); /* magnitude of each byte read as signed */
+ uint8x16_t v_u = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v_u); /* widening horizontal add of the 16 magnitudes */
+
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ if (sum > lmins) /* already above lmins: the rest of the row is not filtered */
+ {
+ return sum;
+ }
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left or upper-left neighbour, so the byte above is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
+ dp[3] = (png_byte)(((int)rp[3] - (int)pp[3]) & 0xff);
+ sum += MID - abs((int)dp[0] - MID); /* v < 128 ? v : 256 - v */
+ sum += MID - abs((int)dp[1] - MID);
+ sum += MID - abs((int)dp[2] - MID);
+ sum += MID - abs((int)dp[3] - MID);
+ rp += BYTE_RGBA;
+ pp += BYTE_RGBA;
+ dp += BYTE_RGBA;
+ count -= BYTE_RGBA;
+ }
+
+ png_bytep cp = pp - BYTE_RGBA; /* upper-left neighbour */
+ png_bytep lp = rp - BYTE_RGBA; /* left neighbour */
+ while (count > 0) /* scalar tail: same predictor, one byte at a time */
+ {
+ int a, b, c, pa, pb, pc, p;
+
+ b = *pp++;
+ c = *cp++;
+ a = *lp++;
+
+ p = b - c;
+ pc = a - c;
+
+ pa = abs(p);
+ pb = abs(pc);
+ pc = abs(p + pc);
+
+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
+ *dp = (png_byte)(((int)*rp++ - p) & 0xff);
+
+ count--;
+ sum += MID - abs((int)*dp++ - MID);
+ if (sum > lmins)
+ {
+ return sum;
+ }
+ }
+ return sum;
+}
+
+void png_write_filter_paeth4_neon_only(png_structrp png_ptr,
+ size_t row_bytes)
+{ /* PAETH filter with a 4-byte left distance: try_row = row_buf - predictor(a = left, b = above, c = upper-left) for the whole row; no sum, no early exit */
+ png_bytep rp = png_ptr->row_buf + 1; /* input bytes; index 0 of each buffer is the filter-type byte */
+ png_bytep pp = png_ptr->prev_row + 1; /* byte directly above */
+ png_bytep dp = png_ptr->try_row + 1; /* filtered output */
+ png_ptr->try_row[0] = PNG_FILTER_VALUE_PAETH;
+
+ size_t count = row_bytes; /* bytes still to filter */
+ uint8x16_t tmp_a = vdupq_n_u8(0); /* previous 16 bytes of the current row; zero ahead of the row start */
+ uint8x16_t tmp_c = vdupq_n_u8(0); /* previous 16 bytes of the row above */
+ while (count >= STEP)
+ {
+ uint8x16_t qrp = vld1q_u8(rp);
+ uint8x16_t b = vld1q_u8(pp); /* above */
+ uint8x16_t a = vextq_u8(tmp_a, qrp, SHIFT_RGBA); /* left: current row shifted by 4 bytes */
+ uint8x16_t c = vextq_u8(tmp_c, b, SHIFT_RGBA); /* upper-left: row above shifted by 4 bytes */
+ tmp_a = qrp;
+ tmp_c = b;
+
+ int16x8_t a_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))); /* widen to 16 bits so the differences below cannot wrap */
+ int16x8_t a_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a)));
+ int16x8_t b_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b)));
+ int16x8_t b_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b)));
+ int16x8_t c_hign = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(c)));
+ int16x8_t c_low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(c)));
+
+ int16x8_t p = vsubq_s16(b_hign, c_hign); /* upper 8 lanes: p = b - c */
+ int16x8_t pc = vsubq_s16(a_hign, c_hign); /* a - c */
+ int16x8_t pa = vabsq_s16(p); /* pa = |b - c| */
+ int16x8_t pb = vabsq_s16(pc); /* pb = |a - c| */
+ pc = vabsq_s16(vaddq_s16(p, pc)); /* pc = |a + b - 2c| */
+ uint16x8_t p1_u = vcleq_s16(pa, pb);
+ uint16x8_t pa_u = vcleq_s16(pa, pc);
+ uint16x8_t pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u); /* pa <= pb && pa <= pc */
+ uint8x8_t d_hign = vmovn_u16(pb_u); /* narrow the lane masks back to bytes */
+ uint8x8_t e_hign = vmovn_u16(p1_u);
+
+ p = vsubq_s16(b_low, c_low); /* same computation for the lower 8 lanes */
+ pc = vsubq_s16(a_low, c_low);
+ pa = vabsq_s16(p);
+ pb = vabsq_s16(pc);
+ pc = vabsq_s16(vaddq_s16(p, pc));
+ p1_u = vcleq_s16(pa, pb);
+ pa_u = vcleq_s16(pa, pc);
+ pb_u = vcleq_s16(pb, pc);
+ p1_u = vandq_u16(p1_u, pa_u);
+ uint8x8_t d_low = vmovn_u16(pb_u);
+ uint8x8_t e_low = vmovn_u16(p1_u);
+
+ uint8x16_t d = vcombine_u8(d_low, d_hign);
+ uint8x16_t e = vcombine_u8(e_low, e_hign);
+ d = vbslq_u8(d, b, c); /* (pb <= pc) ? b : c */
+ e = vbslq_u8(e, a, d); /* (pa <= pb && pa <= pc) ? a : the choice above */
+
+ uint8x16_t qdp = vsubq_u8(qrp, e); /* modulo-256 difference from the predictor */
+ vst1q_u8(dp, qdp);
+
+ rp += STEP;
+ pp += STEP;
+ dp += STEP;
+ count -= STEP;
+ }
+
+ if (count == row_bytes) /* vector loop never ran: the first pixel has no left or upper-left neighbour, so the byte above is subtracted */
+ {
+ dp[0] = (png_byte)(((int)rp[0] - (int)pp[0]) & 0xff);
+ dp[1] = (png_byte)(((int)rp[1] - (int)pp[1]) & 0xff);
+ dp[2] = (png_byte)(((int)rp[2] - (int)pp[2]) & 0xff);
+ dp[3] = (png_byte)(((int)rp[3] - (int)pp[3]) & 0xff);
+ rp += BYTE_RGBA;
+ pp += BYTE_RGBA;
+ dp += BYTE_RGBA;
+ count -= BYTE_RGBA;
+ }
+
+ png_bytep cp = pp - BYTE_RGBA; /* upper-left neighbour */
+ png_bytep lp = rp - BYTE_RGBA; /* left neighbour */
+ while (count > 0) /* scalar tail: same predictor, one byte at a time */
+ {
+ int a, b, c, pa, pb, pc, p;
+
+ b = *pp++;
+ c = *cp++;
+ a = *lp++;
+
+ p = b - c;
+ pc = a - c;
+
+ pa = abs(p);
+ pb = abs(pc);
+ pc = abs(p + pc);
+
+ p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
+ *dp++ = (png_byte)(((int)*rp++ - p) & 0xff);
+ count--;
+ }
+}
+#endif
+
static size_t /* PRIVATE */
png_setup_sub_row(png_structrp png_ptr, png_uint_32 bpp,
size_t row_bytes, size_t lmins)
{
+#ifdef PNG_WRITE_NEON_ENABLE /* NEON fast path for bpp == 3 and bpp == 4 */
+ if (bpp == 3)
+ {
+ return png_write_filter_sub3_neon(png_ptr, row_bytes, lmins);
+ }
+ if (bpp == 4)
+ {
+ return png_write_filter_sub4_neon(png_ptr, row_bytes, lmins);
+ }
+#endif /* PNG_WRITE_NEON_ENABLE; any other bpp uses the generic code below */
png_bytep rp, dp, lp;
size_t i;
size_t sum = 0;
@@ -2318,6 +3261,23 @@ static void /* PRIVATE */
png_setup_sub_row_only(png_structrp png_ptr, png_uint_32 bpp,
size_t row_bytes)
{
+#ifdef PNG_WRITE_NEON_ENABLE
+   /* NEON fast path for bpp == 3 and bpp == 4; any other bpp falls through
+    * to the generic code below.  The helper is called as a statement and
+    * followed by a plain 'return;' because ISO C does not allow 'return'
+    * with an expression in a function returning void.
+    */
+   if (bpp == 3)
+   {
+      png_write_filter_sub3_neon_only(png_ptr, row_bytes);
+      return;
+   }
+   if (bpp == 4)
+   {
+      png_write_filter_sub4_neon_only(png_ptr, row_bytes);
+      return;
+   }
+#endif /* PNG_WRITE_NEON_ENABLE */
png_bytep rp, dp, lp;
size_t i;
@@ -2339,6 +3292,9 @@ png_setup_sub_row_only(png_structrp png_ptr, png_uint_32 bpp,
static size_t /* PRIVATE */
png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins)
{
+#ifdef PNG_WRITE_NEON_ENABLE /* NEON build: always delegate, no bpp check */
+ return png_write_filter_up_neon(png_ptr, row_bytes, lmins);
+#endif /* PNG_WRITE_NEON_ENABLE; code below is unreachable when defined */
png_bytep rp, dp, pp;
size_t i;
size_t sum = 0;
@@ -2366,6 +3322,14 @@ png_setup_up_row(png_structrp png_ptr, size_t row_bytes, size_t lmins)
static void /* PRIVATE */
png_setup_up_row_only(png_structrp png_ptr, size_t row_bytes)
{
+#ifdef PNG_WRITE_NEON_ENABLE
+   /* NEON build: always delegate to the NEON helper, so the generic code
+    * below is unreachable.  Call-then-return because ISO C does not allow
+    * 'return' with an expression in a function returning void.
+    */
+   png_write_filter_up_neon_only(png_ptr, row_bytes);
+   return;
+#endif /* PNG_WRITE_NEON_ENABLE */
png_bytep rp, dp, pp;
size_t i;
@@ -2383,6 +3342,16 @@ static size_t /* PRIVATE */
png_setup_avg_row(png_structrp png_ptr, png_uint_32 bpp,
size_t row_bytes, size_t lmins)
{
+#ifdef PNG_WRITE_NEON_ENABLE /* NEON fast path for bpp == 3 and bpp == 4 */
+ if (bpp == 3)
+ {
+ return png_write_filter_avg3_neon(png_ptr, row_bytes, lmins);
+ }
+ if (bpp == 4)
+ {
+ return png_write_filter_avg4_neon(png_ptr, row_bytes, lmins);
+ }
+#endif /* PNG_WRITE_NEON_ENABLE; any other bpp uses the generic code below */
png_bytep rp, dp, pp, lp;
png_uint_32 i;
size_t sum = 0;
@@ -2423,6 +3392,23 @@ static void /* PRIVATE */
png_setup_avg_row_only(png_structrp png_ptr, png_uint_32 bpp,
size_t row_bytes)
{
+#ifdef PNG_WRITE_NEON_ENABLE
+   /* NEON fast path for bpp == 3 and bpp == 4; any other bpp falls through
+    * to the generic code below.  The helper is called as a statement and
+    * followed by a plain 'return;' because ISO C does not allow 'return'
+    * with an expression in a function returning void.
+    */
+   if (bpp == 3)
+   {
+      png_write_filter_avg3_neon_only(png_ptr, row_bytes);
+      return;
+   }
+   if (bpp == 4)
+   {
+      png_write_filter_avg4_neon_only(png_ptr, row_bytes);
+      return;
+   }
+#endif /* PNG_WRITE_NEON_ENABLE */
png_bytep rp, dp, pp, lp;
png_uint_32 i;
@@ -2445,6 +3424,16 @@ static size_t /* PRIVATE */
png_setup_paeth_row(png_structrp png_ptr, png_uint_32 bpp,
size_t row_bytes, size_t lmins)
{
+#ifdef PNG_WRITE_NEON_ENABLE /* NEON fast path for bpp == 3 and bpp == 4 */
+ if (bpp == 3)
+ {
+ return png_write_filter_paeth3_neon(png_ptr, row_bytes, lmins);
+ }
+ if (bpp == 4)
+ {
+ return png_write_filter_paeth4_neon(png_ptr, row_bytes, lmins);
+ }
+#endif /* PNG_WRITE_NEON_ENABLE; any other bpp uses the generic code below */
png_bytep rp, dp, pp, cp, lp;
size_t i;
size_t sum = 0;
@@ -2506,6 +3495,23 @@ static void /* PRIVATE */
png_setup_paeth_row_only(png_structrp png_ptr, png_uint_32 bpp,
size_t row_bytes)
{
+#ifdef PNG_WRITE_NEON_ENABLE
+   /* NEON fast path for bpp == 3 and bpp == 4; any other bpp falls through
+    * to the generic code below.  The helper is called as a statement and
+    * followed by a plain 'return;' because ISO C does not allow 'return'
+    * with an expression in a function returning void.
+    */
+   if (bpp == 3)
+   {
+      png_write_filter_paeth3_neon_only(png_ptr, row_bytes);
+      return;
+   }
+   if (bpp == 4)
+   {
+      png_write_filter_paeth4_neon_only(png_ptr, row_bytes);
+      return;
+   }
+#endif /* PNG_WRITE_NEON_ENABLE */
png_bytep rp, dp, pp, cp, lp;
size_t i;
@@ -2613,6 +3612,25 @@ png_write_find_filter(png_structrp png_ptr, png_row_infop row_info)
*/
png_bytep rp;
size_t sum = 0;
+#ifdef PNG_WRITE_NEON_ENABLE /* NEON variant of the per-byte sum */
+ size_t bytes = row_info->rowbytes;
+ rp = row_buf + 1;
+ while (bytes >= STEP) /* vector loop; assumes STEP == 16, the vld1q_u8 width - TODO confirm */
+ {
+ uint8x16_t v = vld1q_u8(rp);
+ int8x16_t v_s = vreinterpretq_s8_u8(v);
+ v_s = vabsq_s8(v_s); /* per byte: v if v < 128, else 256 - v (0x80 stays 128) */
+ v = vreinterpretq_u8_s8(v_s);
+ sum += vaddlvq_u8(v); /* widening add across all 16 lanes */
+ rp += STEP;
+ bytes -= STEP;
+ }
+ while (bytes > 0) /* scalar tail for the remaining bytes */
+ {
+ sum += 128 - abs((int)*rp++ - 128); /* same per-byte value as above */
+ bytes--;
+ }
+#else /* !PNG_WRITE_NEON_ENABLE */
size_t i;
unsigned int v;
@@ -2627,7 +3645,7 @@ png_write_find_filter(png_structrp png_ptr, png_row_infop row_info)
#endif
}
}
-
+#endif /* PNG_WRITE_NEON_ENABLE */
mins = sum;
}