From f18d9d7237b6415735d0faae27ab817732047265 Mon Sep 17 00:00:00 2001 From: "Carsten Haitzler (Rasterman)" Date: Sun, 6 Nov 2016 12:30:13 +0900 Subject: [PATCH] remove memcmp calls for better performance where size is known so i have been doing some profiling on my rpi3 ... and it seems memcmp() is like the number one top used function - especially running e in wayland compositor mode. it uses accoring to perf top about 9-15% of samples (samples are not adding up to 100%). no - i cant seem to get a call graph because all that happens is the whole kernel locks up solid if i try, so i can only get the leaf node call stats. what function was currently active at the sample time. memcmp is the biggest by far. 2-3 times anything else. 13.47% libarmmem.so [.] memcmp 6.43% libevas.so.1.18.99 [.] _evas_render_phase1_object_pro 4.74% libevas.so.1.18.99 [.] evas_render_updates_internal.c 2.84% libeo.so.1.18.99 [.] _eo_obj_pointer_get 2.49% libevas.so.1.18.99 [.] evas_render_updates_internal_l 2.03% libpthread-2.24.so [.] pthread_getspecific 1.61% libeo.so.1.18.99 [.] efl_data_scope_get 1.60% libevas.so.1.18.99 [.] _evas_event_object_list_raw_in 1.54% libevas.so.1.18.99 [.] evas_object_smart_changed_get 1.32% libgcc_s.so.1 [.] __udivsi3 1.21% libevas.so.1.18.99 [.] evas_object_is_active 1.14% libc-2.24.so [.] malloc 0.96% libevas.so.1.18.99 [.] evas_render_mapped 0.85% libeo.so.1.18.99 [.] efl_isa yeah. it's perf. it's sampling so not 100% accurate, but close to "good enough" for the bigger stuff. so interestingly memcmp() is actually in a special library/module (libarmmem.so) and is a REAL function call. so doing memcmp's for small bits of memory ESPECIALLY when we know their size in advance is not great. i am not sure our own use of memcmp() is the actual culprit because even with this patch memcmp still is right up there. we use it for stringshare which is harder to remove as stringshare has variable sized memory blobs to compare. but the point remains - memcmp() is an ACTUAL function call. even on x86 (i checked the assembly). and replacing it with a static inline custom comparer is better. in fact i did that and benchmarked it as a sample case for eina_tiler which has 4 ints (16 bytes) to compare every time. i also compiled to assembly on x86 to inspect and make sure things made sense. the text color compare was just comparing 4 bytes as a color (an int worth) which was silly to use memcmp on as it could just cast to an int and do a == b. the map was a little more evil as it was 2 ptrs plus 2 bitfields, but the way bitfields work means i can assume the last byte is both bitfields combined. i can be a little more evil for the rect tests as 4 ints compared is the same as comparing 2 long longs (64bit types). yes. don't get pedantic. all platforms efl works on work this way and this is a base assumption in efl and it's true everywhere worth talking about. yes - i tried __int128 too. it was not faster on x86 anyway and can't compile on armv7. in my speed tests on x86-64, comparing 2 rects by casting to a struct of 2 long long's and comparing just those is 70% faster than comapring 4 ints. and the 2 long longs is 360% faster than a memcmp. on arm (my rpi3) the long long is 12% faster than the 4 ints, and it is 226% faster than a memcmp(). it'd be best if we didnt even have to compare at all, but with these algorithms we do, so doing it faster is better. we probably should nuke all the memcmp's we have that are not of large bits of memory or variable sized bits of memory. i set breakpoints for memcmp and found at least a chunk in efl. but also it seems the vc4 driver was also doing it too. i have no idea how much memory it was doing this to and it may ultimately be the biggest culprit here, BUT we may as well reduce our overhead since i've found this anyway. less "false positives" when hunting problems. why am i doing this? i'm setting framerate hiccups. eg like we drop 3, 5 or 10 frames, then drop another bunch, then go back to smooth, then this hiccup again. finding out WHAT is causing that hiccup is hard. i can only SEE the hiccups on my rpi3 - not on x86. i am not so sure it's cpufreq bouncing about as i've locked cpu to 600mhz and it still happens. it's something else. maybe something we are polling? maybe it's something in our drm/kms backend? maybe its in the vc4 drivers or kernel parts? i have no idea. trying to hunt this is hard, but this is important as this is something that possibly is affecting everyone but other hw is fast enough to hide it... in the meantime find and optimize what i find along the way. @optimize --- src/lib/eina/eina_tiler.c | 23 ++++++++++++++++++----- src/lib/evas/canvas/evas_object_main.c | 17 ++++++++++++++++- src/lib/evas/canvas/evas_object_text.c | 18 ++++++++++++++---- src/lib/evas/include/evas_private.h | 2 ++ 4 files changed, 50 insertions(+), 10 deletions(-) diff --git a/src/lib/eina/eina_tiler.c b/src/lib/eina/eina_tiler.c index e8abcdc97b..938bca7b25 100644 --- a/src/lib/eina/eina_tiler.c +++ b/src/lib/eina/eina_tiler.c @@ -1202,13 +1202,26 @@ EAPI Eina_Bool eina_tiler_empty(Eina_Tiler *t) { EINA_MAGIC_CHECK_TILER(t, EINA_TRUE); + return ((!t->splitter.rects.head) && (!t->splitter.rects.tail)); +} - return !memcmp(&t->splitter.rects, &list_zeroed, sizeof (list_t)); +typedef struct _Rectangle_Same +{ + unsigned long long x, y; +} Rectangle_Same; + +static inline Eina_Bool +_rect_same(Eina_Rectangle *rec1, Eina_Rectangle *rec2) +{ + // this is ok because all the rects being compared will be aligned to 8bytes + Rectangle_Same *same1 = (Rectangle_Same *)rec1; + Rectangle_Same *same2 = (Rectangle_Same *)rec2; + return ((same1->x == same2->y) && (same1->y == same2->y)); } EAPI Eina_Bool eina_tiler_rect_add(Eina_Tiler *t, const Eina_Rectangle *r) { - Eina_Rectangle tmp; + Eina_Rectangle tmp __attribute__ ((aligned (8))); EINA_MAGIC_CHECK_TILER(t, EINA_FALSE); EINA_SAFETY_ON_NULL_RETURN_VAL(r, EINA_FALSE); @@ -1223,7 +1236,7 @@ EAPI Eina_Bool eina_tiler_rect_add(Eina_Tiler *t, const Eina_Rectangle *r) if ((tmp.w <= 0) || (tmp.h <= 0)) return EINA_FALSE; - if (!memcmp(&tmp, &t->last.add, sizeof (Eina_Rectangle))) + if (_rect_same(&tmp, &t->last.add)) return EINA_FALSE; t->last.add = tmp; @@ -1234,7 +1247,7 @@ EAPI Eina_Bool eina_tiler_rect_add(Eina_Tiler *t, const Eina_Rectangle *r) EAPI void eina_tiler_rect_del(Eina_Tiler *t, const Eina_Rectangle *r) { - Eina_Rectangle tmp; + Eina_Rectangle tmp __attribute__ ((aligned (8))); EINA_MAGIC_CHECK_TILER(t); EINA_SAFETY_ON_NULL_RETURN(r); @@ -1249,7 +1262,7 @@ EAPI void eina_tiler_rect_del(Eina_Tiler *t, const Eina_Rectangle *r) if ((tmp.w <= 0) || (tmp.h <= 0)) return; - if (!memcmp(&tmp, &t->last.del, sizeof (Eina_Rectangle))) + if (_rect_same(&tmp, &t->last.del)) return; t->last.del = tmp; diff --git a/src/lib/evas/canvas/evas_object_main.c b/src/lib/evas/canvas/evas_object_main.c index 20b7798339..76c3c2bf66 100644 --- a/src/lib/evas/canvas/evas_object_main.c +++ b/src/lib/evas/canvas/evas_object_main.c @@ -128,6 +128,21 @@ evas_object_change_reset(Evas_Object *eo_obj) obj->need_surface_clear = EINA_FALSE; } +typedef struct _Map_Same +{ + void *p1, *p2; + Eina_Bool b1; +} Map_Same; + +static inline Eina_Bool +_map_same(const void *map1, const void *map2) +{ + const Map_Same *same1 = map1, *same2 = map2; + return ((same1->p1 == same2->p1) && + (same1->p2 == same2->p2) && + (same1->b1 == same2->b1)); +} + void evas_object_cur_prev(Evas_Object *eo_obj) { @@ -155,7 +170,7 @@ evas_object_cur_prev(Evas_Object *eo_obj) } EINA_COW_WRITE_END(evas_object_map_cow, obj->map, map_write); } - if (memcmp(&obj->map->prev, &obj->map->cur, sizeof (obj->map->cur))) + if (!_map_same(&obj->map->prev, &obj->map->cur)) { EINA_COW_WRITE_BEGIN(evas_object_map_cow, obj->map, Evas_Object_Map_Data, map_write) map_write->prev = map_write->cur; diff --git a/src/lib/evas/canvas/evas_object_text.c b/src/lib/evas/canvas/evas_object_text.c index 4aa29c82a2..2e8c2f7d0d 100644 --- a/src/lib/evas/canvas/evas_object_text.c +++ b/src/lib/evas/canvas/evas_object_text.c @@ -33,6 +33,9 @@ struct _Evas_Text_Data DATA32 magic; struct { + // WARNING - you cannot change the below outline/shadow etc. members + // and their content without also updating _color_same() in this + // file struct { unsigned char r, g, b, a; } outline, shadow, glow, glow2; @@ -192,6 +195,13 @@ _evas_object_text_item_del(Evas_Text_Data *o, Evas_Object_Text_Item *it) free(it); } +static inline Eina_Bool +_color_same(const void *col1, const void *col2) +{ + const unsigned int *icol1 = col1, *icol2 = col2; + return (*icol1 == *icol2); +} + static void _evas_object_text_items_clean(Evas_Object_Protected_Data *obj, Evas_Text_Data *o) { @@ -199,10 +209,10 @@ _evas_object_text_items_clean(Evas_Object_Protected_Data *obj, Evas_Text_Data *o if ((o->cur.font == o->prev.font) && (o->cur.fdesc == o->prev.fdesc) && (o->cur.size == o->prev.size) && - (!memcmp(&o->cur.outline, &o->prev.outline, sizeof (o->cur.outline))) && - (!memcmp(&o->cur.shadow, &o->prev.shadow, sizeof (o->cur.shadow))) && - (!memcmp(&o->cur.glow, &o->prev.glow, sizeof (o->cur.glow))) && - (!memcmp(&o->cur.glow2, &o->prev.glow2, sizeof (o->cur.glow2))) && + (_color_same(&o->cur.outline, &o->prev.outline)) && + (_color_same(&o->cur.shadow, &o->prev.shadow)) && + (_color_same(&o->cur.glow, &o->prev.glow)) && + (_color_same(&o->cur.glow2, &o->prev.glow2)) && (o->cur.style == o->prev.style) && (obj->cur->scale == obj->prev->scale)) { diff --git a/src/lib/evas/include/evas_private.h b/src/lib/evas/include/evas_private.h index 6b0ff6862c..5271ac5805 100644 --- a/src/lib/evas/include/evas_private.h +++ b/src/lib/evas/include/evas_private.h @@ -1002,6 +1002,8 @@ struct _Evas_Object_Proxy_Data struct _Evas_Object_Map_Data { + // WARNING - you cannot change the below cur/prev layout, content or size + // unless you also update evas_object_main.c _map_same() func struct { Evas_Map *map; Evas_Object *map_parent;