author    Cedric BAIL <cedric@osg.samsung.com>  2015-04-28 23:39:18 +0200
committer Cedric BAIL <cedric@osg.samsung.com>  2015-05-07 09:53:11 +0200
commit    2c2983aadb8cd7351d821c0a5b01efafd445665d (patch)
tree      63a0bb28fb4f9ade66317ab422da27d870116f77 /src/lib/evas/common
parent    10ece61dbf6d77d0a42df05c88742114c0ad6ef2 (diff)
evas: implement _op_blend_rel_{p,pan}_dp_neon using NEON intrinsics
Summary: NEON intrinsics can be built both for armv7 and armv8.

Reviewers: raster, cedric

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2441

Signed-off-by: Cedric BAIL <cedric@osg.samsung.com>
Diffstat (limited to 'src/lib/evas/common')
-rw-r--r--  src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c | 182
1 file changed, 176 insertions(+), 6 deletions(-)
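The heavy lifting in both converted functions is MUL_256(a, c): as the scalar fallback kept in the diff below shows, it scales every 8-bit channel of pixel c by a factor a in [1..256] and divides by 256. The sketch below is not the EFL code itself (the helper names mul_256_scalar and mul_256_neon are made up for illustration); it shows how that operation maps onto NEON intrinsics for four ARGB pixels at a time: the factor is replicated into every byte of its lane, multiplied per byte with a widening multiply and a narrowing shift, and the a == 256 case, which wraps to 0 in eight bits, is patched up afterwards with a compare-and-select.

#include <arm_neon.h>
#include <stdint.h>

/* Scalar reference: scale each 8-bit channel of c by a (1..256), divide by 256. */
static inline uint32_t
mul_256_scalar(uint32_t a, uint32_t c)
{
   uint32_t r = 0;
   int i;

   for (i = 0; i < 32; i += 8)
     r |= ((((c >> i) & 0xff) * a) >> 8) << i;
   return r;
}

/* NEON version for four pixels at once.  Each lane of a_32x4 holds the factor
 * for that pixel in its low byte, with 0 standing for 256. */
static inline uint32x4_t
mul_256_neon(uint32x4_t a_32x4, uint32x4_t c_32x4)
{
   // replicate the factor into every byte of its lane: x * 0x01010101
   uint32x4_t rep_32x4 = vmulq_u32(a_32x4, vdupq_n_u32(0x01010101));
   uint8x16_t a_8x16 = vreinterpretq_u8_u32(rep_32x4);
   uint8x16_t c_8x16 = vreinterpretq_u8_u32(c_32x4);

   // per-byte (a * c) >> 8: widen to 16 bits, then narrow back with a shift
   uint16x8_t lo_16x8 = vmull_u8(vget_low_u8(a_8x16), vget_low_u8(c_8x16));
   uint16x8_t hi_16x8 = vmull_u8(vget_high_u8(a_8x16), vget_high_u8(c_8x16));
   uint8x16_t r_8x16 = vcombine_u8(vshrn_n_u16(lo_16x8, 8),
                                   vshrn_n_u16(hi_16x8, 8));
   uint32x4_t r_32x4 = vreinterpretq_u32_u8(r_8x16);

   // lanes whose factor wrapped to 0 really meant "multiply by 256": keep c
   uint32x4_t is_256 = vceqq_u32(a_32x4, vdupq_n_u32(0));
   return vbslq_u32(is_256, c_32x4, r_32x4);
}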
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
index 8d70b9d..0db97be 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
@@ -1,8 +1,6 @@
 #ifdef BUILD_NEON
-#ifdef BUILD_NEON_INTRINSICS
 #include <arm_neon.h>
 #endif
-#endif
 /* blend pixel --> dst */
 
 #ifdef BUILD_NEON
@@ -747,8 +745,114 @@ init_blend_pixel_pt_funcs_neon(void)
 #ifdef BUILD_NEON
 static void
 _op_blend_rel_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e = d + l;
-   while (d < e) {
+   uint16x8_t cs0_16x8;
+   uint16x8_t cs1_16x8;
+   uint16x8_t ld0_16x8;
+   uint16x8_t ld1_16x8;
+   uint32x4_t c_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t cs_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t l_32x4;
+   uint32x4_t ld_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t c_8x16;
+   uint8x16_t cs_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t l_8x16;
+   uint8x16_t ld_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t c0_8x8;
+   uint8x8_t c1_8x8;
+   uint8x8_t cs0_8x8;
+   uint8x8_t cs1_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t l0_8x8;
+   uint8x8_t l1_8x8;
+   uint8x8_t ld0_8x8;
+   uint8x8_t ld1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+        d_8x16 = vreinterpretq_u8_u32(d_32x4);
+        d0_8x8 = vget_low_u8(d_8x16);
+        d1_8x8 = vget_high_u8(d_8x16);
+
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
+
+        // calculate l = 256 - (*s >> 24)
+        l_32x4 = vshrq_n_u32(s_32x4, 24);
+        l_32x4 = vmulq_u32(x1_32x4, l_32x4);
+        l_8x16 = vreinterpretq_u8_u32(l_32x4);
+        l_8x16 = vsubq_u8(x0_8x16, l_8x16);
+        l0_8x8 = vget_low_u8(l_8x16);
+        l1_8x8 = vget_high_u8(l_8x16);
+
+        // multiply MUL_256(l, *d)
+        ld0_16x8 = vmull_u8(l0_8x8, d0_8x8);
+        ld1_16x8 = vmull_u8(l1_8x8, d1_8x8);
+        ld0_8x8 = vshrn_n_u16(ld0_16x8, 8);
+        ld1_8x8 = vshrn_n_u16(ld1_16x8, 8);
+        ld_8x16 = vcombine_u8(ld0_8x8, ld1_8x8);
+        ld_32x4 = vreinterpretq_u32_u8(ld_8x16);
+
+        // select d where l should be 256
+        cond_32x4 = vceqq_u32(l_32x4, x0_32x4);
+        ld_32x4 = vbslq_u32(cond_32x4, d_32x4, ld_32x4);
+
+        // calculate 1 + (*d >> 24)
+        c_32x4 = vshrq_n_u32(d_32x4, 24);
+        c_32x4 = vmulq_u32(x1_32x4, c_32x4);
+        c_8x16 = vreinterpretq_u8_u32(c_32x4);
+        c_8x16 = vaddq_u8(c_8x16, x1_8x16);
+        c0_8x8 = vget_low_u8(c_8x16);
+        c1_8x8 = vget_high_u8(c_8x16);
+
+        // multiply MUL_256(c, *s)
+        cs0_16x8 = vmull_u8(c0_8x8, s0_8x8);
+        cs1_16x8 = vmull_u8(c1_8x8, s1_8x8);
+        cs0_8x8 = vshrn_n_u16(cs0_16x8, 8);
+        cs1_8x8 = vshrn_n_u16(cs1_16x8, 8);
+        cs_8x16 = vcombine_u8(cs0_8x8, cs1_8x8);
+        cs_32x4 = vreinterpretq_u32_u8(cs_8x16);
+
+        // select s where c should be 256
+        c_32x4 = vreinterpretq_u32_u8(c_8x16);
+        cond_32x4 = vceqq_u32(c_32x4, x0_32x4);
+        cs_32x4 = vbslq_u32(cond_32x4, s_32x4, cs_32x4);
+
+        // add up everything
+        d_32x4 = vaddq_u32(cs_32x4, ld_32x4);
+
+        // save result
+        vst1q_u32(d, d_32x4);
+
+        d += 4;
+        s += 4;
+     }
+
+   end += (l & 3);
+   while (d < end)
+     {
         l = 256 - (*s >> 24);
         c = 1 + (*d >> 24);
         *d = MUL_256(c, *s) + MUL_256(l, *d);
@@ -759,8 +863,74 @@ _op_blend_rel_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 
 static void
 _op_blend_rel_pan_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
-   DATA32 *e = d + l;
-   while (d < e) {
+   uint16x8_t cs0_16x8;
+   uint16x8_t cs1_16x8;
+   uint32x4_t c_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t cs_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t c_8x16;
+   uint8x16_t cs_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t c0_8x8;
+   uint8x8_t c1_8x8;
+   uint8x8_t cs0_8x8;
+   uint8x8_t cs1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+   DATA32 *end = d + (l & ~3);
+   while (d < end)
+     {
+        // load 4 elements from d
+        d_32x4 = vld1q_u32(d);
+
+        // load 4 elements from s
+        s_32x4 = vld1q_u32(s);
+        s_8x16 = vreinterpretq_u8_u32(s_32x4);
+        s0_8x8 = vget_low_u8(s_8x16);
+        s1_8x8 = vget_high_u8(s_8x16);
+
+        // calculate 1 + (*d >> 24)
+        c_32x4 = vshrq_n_u32(d_32x4, 24);
+        c_32x4 = vmulq_u32(x1_32x4, c_32x4);
+        c_8x16 = vreinterpretq_u8_u32(c_32x4);
+        c_8x16 = vaddq_u8(c_8x16, x1_8x16);
+        c0_8x8 = vget_low_u8(c_8x16);
+        c1_8x8 = vget_high_u8(c_8x16);
+
+        // multiply MUL_256(c, *s)
+        cs0_16x8 = vmull_u8(c0_8x8, s0_8x8);
+        cs1_16x8 = vmull_u8(c1_8x8, s1_8x8);
+        cs0_8x8 = vshrn_n_u16(cs0_16x8, 8);
+        cs1_8x8 = vshrn_n_u16(cs1_16x8, 8);
+        cs_8x16 = vcombine_u8(cs0_8x8, cs1_8x8);
+        cs_32x4 = vreinterpretq_u32_u8(cs_8x16);
+
+        // select s where c should be 256
+        c_32x4 = vreinterpretq_u32_u8(c_8x16);
+        cond_32x4 = vceqq_u32(c_32x4, x0_32x4);
+        cs_32x4 = vbslq_u32(cond_32x4, s_32x4, cs_32x4);
+
+        // save result
+        vst1q_u32(d, cs_32x4);
+
+        d += 4;
+        s += 4;
+     }
+
+   end += (l & 3);
+   while (d < end)
+     {
         c = 1 + (*d >> 24);
         *d++ = MUL_256(c, *s);
         s++;
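Both converted functions share the same loop structure: the NEON path consumes the length in blocks of four 32-bit pixels (l & ~3), and the original scalar body is kept to finish the 0-3 leftover pixels (l & 3). A stripped-down sketch of just that structure, with a placeholder per-pixel operation standing in for the blend (not EFL code):

#include <arm_neon.h>
#include <stdint.h>

static void
op_rows_sketch(const uint32_t *s, uint32_t *d, int l)
{
   uint32_t *end = d + (l & ~3);        // largest multiple of 4 <= l

   while (d < end)                      // four pixels per iteration
     {
        uint32x4_t s4 = vld1q_u32(s);
        uint32x4_t d4 = vld1q_u32(d);
        d4 = vaddq_u32(d4, s4);         // placeholder for the real blend
        vst1q_u32(d, d4);
        d += 4; s += 4;
     }

   end += (l & 3);                      // 0-3 pixels left
   while (d < end)                      // finish with the scalar code
     {
        *d = *d + *s;                   // same placeholder, one pixel
        d++; s++;
     }
}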