author     Cedric BAIL <cedric@osg.samsung.com>  2015-04-28 23:40:04 +0200
committer  Cedric BAIL <cedric@osg.samsung.com>  2015-05-07 09:53:11 +0200
commit     ad1076525a519ccc53be8a2a6992c870b857bf4a (patch)
tree       1e67af59a759daed5468af5905e83d8caf6d1125 /src/lib/evas/common
parent     2c2983aadb8cd7351d821c0a5b01efafd445665d (diff)
evas: implement _op_blend_rel_mas_c_dp_neon using NEON intrinsics
Summary: NEON intrinsics can be built both for armv7 and armv8.

Reviewers: raster, cedric
Reviewed By: cedric
Subscribers: cedric
Projects: #efl
Differential Revision: https://phab.enlightenment.org/D2442

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
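The rationale in the summary is portability: code written against the <arm_neon.h> intrinsics compiles both for 32-bit ARM (armv7, typically built with -mfpu=neon) and for AArch64 (armv8), so a single implementation can serve both targets. A minimal sketch of that idea, not taken from this commit (function and variable names are purely illustrative):

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative only: add two arrays of 32-bit pixels, four at a time.
 * The same source builds unchanged on armv7 (NEON) and armv8 (AArch64). */
static void
add_pixels_neon(const uint32_t *a, const uint32_t *b, uint32_t *out, int n)
{
   int i;
   for (i = 0; i + 4 <= n; i += 4)
     {
        uint32x4_t va = vld1q_u32(a + i);       /* load 4 pixels from a   */
        uint32x4_t vb = vld1q_u32(b + i);       /* load 4 pixels from b   */
        vst1q_u32(out + i, vaddq_u32(va, vb));  /* store 4 per-pixel sums */
     }
   for (; i < n; i++) out[i] = a[i] + b[i];     /* scalar tail */
}

The commit below applies the same structure to the relative mask/color blend op: a vector body that handles four pixels per iteration, followed by a scalar tail loop for the remaining (l & 3) pixels.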
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c | 149
1 file changed, 136 insertions(+), 13 deletions(-)
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index e492bb0..2c0fad7 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -1,8 +1,6 @@
 #ifdef BUILD_NEON
-#ifdef BUILD_NEON_INTRINSICS
 #include <arm_neon.h>
 #endif
-#endif
 #define NEONDEBUG 0
 
 
@@ -689,19 +687,144 @@ init_blend_mask_color_pt_funcs_neon(void)
 #ifdef BUILD_NEON
 static void
 _op_blend_rel_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e;
-   int alpha;
-
-   DEBUG_FNCOUNT("not");
-
-   UNROLL8_PLD_WHILE(d, l, e,
-                     {
-                        DATA32 mc = MUL_SYM(*m, c);
-                        alpha = 256 - (mc >> 24);
-                        *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d);
-                        d++;
-                        m++;
-                     });
+   uint16x8_t dc0_16x8;
+   uint16x8_t dc1_16x8;
+   uint16x8_t m_16x8;
+   uint16x8_t mc0_16x8;
+   uint16x8_t mc1_16x8;
+   uint16x8_t temp0_16x8;
+   uint16x8_t temp1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x2_t m_32x2;
+   uint32x4_t a_32x4;
+   uint32x4_t ad_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t dc_32x4;
+   uint32x4_t m_32x4;
+   uint32x4_t temp_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t a_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t dc_8x16;
+   uint8x16_t m_8x16;
+   uint8x16_t mc_8x16;
+   uint8x16_t temp_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t a0_8x8;
+   uint8x8_t a1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t dc0_8x8;
+   uint8x8_t dc1_8x8;
+   uint8x8_t m0_8x8;
+   uint8x8_t m1_8x8;
+   uint8x8_t m_8x8;
+   uint8x8_t mc0_8x8;
+   uint8x8_t mc1_8x8;
+   uint8x8_t temp0_8x8;
+   uint8x8_t temp1_8x8;
+
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x255_16x8 = vdupq_n_u16(0xff);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
+
+        // load 4 elements from m
+        m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
+        m_8x8 = vreinterpret_u8_u32(m_32x2);
+        m_16x8 = vmovl_u8(m_8x8);
+        m_8x16 = vreinterpretq_u8_u16(m_16x8);
+        m_8x8 = vget_low_u8(m_8x16);
+        m_16x8 = vmovl_u8(m_8x8);
+        m_32x4 = vreinterpretq_u32_u16(m_16x8);
+
+        m_32x4 = vmulq_u32(m_32x4, x1_32x4);
+        m_8x16 = vreinterpretq_u8_u32(m_32x4);
+        m0_8x8 = vget_low_u8(m_8x16);
+        m1_8x8 = vget_high_u8(m_8x16);
+
+        // multiply MUL_SYM(*m, c)
+        mc0_16x8 = vmull_u8(m0_8x8, c_8x8);
+        mc1_16x8 = vmull_u8(m1_8x8, c_8x8);
+        mc0_16x8 = vaddq_u16(mc0_16x8, x255_16x8);
+        mc1_16x8 = vaddq_u16(mc1_16x8, x255_16x8);
+        mc0_8x8 = vshrn_n_u16(mc0_16x8, 8);
+        mc1_8x8 = vshrn_n_u16(mc1_16x8, 8);
+        mc_8x16 = vcombine_u8(mc0_8x8, mc1_8x8);
+
+        // calculate alpha = 256 - (mc >> 24)
+        a_8x16 = vsubq_u8(x0_8x16, mc_8x16);
+        a_32x4 = vreinterpretq_u32_u8(a_8x16);
+        a_32x4 = vshrq_n_u32(a_32x4, 24);
+        a_32x4 = vmulq_u32(a_32x4, x1_32x4);
+        a_8x16 = vreinterpretq_u8_u32(a_32x4);
+        a0_8x8 = vget_low_u8(a_8x16);
+        a1_8x8 = vget_high_u8(a_8x16);
+
+        // multiply MUL_256(alpha, *d)
+        temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
+        temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
+        temp0_8x8 = vshrn_n_u16(temp0_16x8, 8);
+        temp1_8x8 = vshrn_n_u16(temp1_16x8, 8);
+        temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+        temp_32x4 = vreinterpretq_u32_u8(temp_8x16);
+
+        // select d where alpha == 0
+        cond_32x4 = vceqq_u32(a_32x4, x0_32x4);
+        ad_32x4 = vbslq_u32(cond_32x4, d_32x4, temp_32x4);
+
+        // shift (*d >> 24)
+        dc_32x4 = vshrq_n_u32(d_32x4, 24);
+        dc_32x4 = vmulq_u32(x1_32x4, dc_32x4);
+        dc_8x16 = vreinterpretq_u8_u32(dc_32x4);
+        dc0_8x8 = vget_low_u8(dc_8x16);
+        dc1_8x8 = vget_high_u8(dc_8x16);
+
+        // multiply MUL_SYM(*d >> 24, mc)
+        dc0_16x8 = vmull_u8(dc0_8x8, mc0_8x8);
+        dc1_16x8 = vmull_u8(dc1_8x8, mc1_8x8);
+        dc0_16x8 = vaddq_u16(dc0_16x8, x255_16x8);
+        dc1_16x8 = vaddq_u16(dc1_16x8, x255_16x8);
+        dc0_8x8 = vshrn_n_u16(dc0_16x8, 8);
+        dc1_8x8 = vshrn_n_u16(dc1_16x8, 8);
+        dc_8x16 = vcombine_u8(dc0_8x8, dc1_8x8);
+
+        // add up everything
+        dc_32x4 = vreinterpretq_u32_u8(dc_8x16);
+        d_32x4 = vaddq_u32(dc_32x4, ad_32x4);
+
+        // save result
+        vst1q_u32(d, d_32x4);
+        d += 4;
+        m += 4;
+     }
+
+   end += (l & 3);
+   while (d < end)
+     {
+        DATA32 mc = MUL_SYM(*m, c);
+        int alpha = 256 - (mc >> 24);
+        *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d);
+        d++;
+        m++;
+     }
 }
 
 #define _op_blend_rel_mas_cn_dp_neon _op_blend_rel_mas_c_dp_neon
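A note on the arithmetic: per 8-bit channel, Evas' MUL_SYM(x, y) appears to reduce to (x * y + 255) >> 8, and that is exactly the widen/bias/narrow sequence the new vector body uses (vmull_u8, vaddq_u16 with 0xff, vshrn_n_u16 by 8) for both MUL_SYM(*m, c) and MUL_SYM(*d >> 24, mc). A standalone sketch of that building block, assuming this reading of the macro (the helper name is illustrative, not from the commit):

#include <arm_neon.h>

/* Illustrative only: per-channel symmetric multiply, (x * y + 255) >> 8,
 * for eight 8-bit channel values at once. */
static inline uint8x8_t
mul_sym_u8x8(uint8x8_t x, uint8x8_t y)
{
   uint16x8_t wide = vmull_u8(x, y);           /* widen-multiply to 16 bit    */
   wide = vaddq_u16(wide, vdupq_n_u16(0xff));  /* add the 0xff rounding bias  */
   return vshrn_n_u16(wide, 8);                /* narrow back with a >> 8     */
}

The +255 bias makes 255 * 255 come back as 255, so a fully opaque channel stays fully opaque after the multiply.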