efl/src/lib/efl/interfaces/efl_text_markup_util.c

501 lines
14 KiB
C

#include "config.h"
#include "Efl.h"
#define MY_CLASS EFL_TEXT_MARKUP_UTIL_CLASS
#define ERR(...) EINA_LOG_DOM_ERR(EINA_LOG_DOMAIN_DEFAULT, __VA_ARGS__)
#define _REPLACEMENT_CHAR 0xFFFC
#define _PARAGRAPH_SEPARATOR 0x2029
#define _NEWLINE '\n'
#define _TAB '\t'
#define _REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBC"
#define _PARAGRAPH_SEPARATOR_UTF8 "\xE2\x80\xA9"
#define _NEWLINE_UTF8 "\n"
#define _TAB_UTF8 "\t"
#define EVAS_TEXTBLOCK_IS_VISIBLE_FORMAT_CHAR(ch) \
(((ch) == _REPLACEMENT_CHAR) || \
((ch) == _NEWLINE) || \
((ch) == _TAB) || \
((ch) == _PARAGRAPH_SEPARATOR))
#define _IS_TAB(item) \
(!strcmp(item, "tab") || !strcmp(item, "\t") || !strcmp(item, "\\t"))
#define _IS_LINE_SEPARATOR(item) \
(!strcmp(item, "br") || !strcmp(item, "\n") || !strcmp(item, "\\n"))
#define _IS_PARAGRAPH_SEPARATOR_SIMPLE(item) \
(!strcmp(item, "ps"))
#define _IS_PARAGRAPH_SEPARATOR(o, item) \
(_IS_PARAGRAPH_SEPARATOR_SIMPLE(item) || \
(o->legacy_newline && _IS_LINE_SEPARATOR(item))) /* Paragraph separator */
static void
_markup_get_text_utf8_append(Eina_Strbuf *sbuf, const char *text)
{
int ch, pos = 0, pos2 = 0;
for (;;)
{
pos = pos2;
ch = eina_unicode_utf8_next_get(text, &pos2);
if ((ch <= 0) || (pos2 <= 0)) break;
if (ch == _NEWLINE)
eina_strbuf_append(sbuf, "<br/>");
else if (ch == _TAB)
eina_strbuf_append(sbuf, "<tab/>");
else if (ch == '<')
eina_strbuf_append(sbuf, "&lt;");
else if (ch == '>')
eina_strbuf_append(sbuf, "&gt;");
else if (ch == '&')
eina_strbuf_append(sbuf, "&amp;");
else if (ch == '"')
eina_strbuf_append(sbuf, "&quot;");
else if (ch == _PARAGRAPH_SEPARATOR)
eina_strbuf_append(sbuf, "<ps/>");
else if (ch == _REPLACEMENT_CHAR)
eina_strbuf_append(sbuf, "&#xfffc;");
else if (ch != '\r')
{
eina_strbuf_append_length(sbuf, text + pos, pos2 - pos);
}
}
}
EOLIAN static char*
_efl_text_markup_util_text_to_markup(Eo *class EINA_UNUSED,
void *_pd EINA_UNUSED, const char *text)
{
Eina_Strbuf *sbuf;
char *str = NULL;
if (!text) return NULL;
sbuf = eina_strbuf_new();
_markup_get_text_utf8_append(sbuf, text);
str = eina_strbuf_string_steal(sbuf);
eina_strbuf_free(sbuf);
return str;
}
/* table of html escapes (that i can find) this should be ordered with the
* most common first as it's a linear search to match - no hash for this.
*
* these are stored as one large string and one additional array that
* contains the offsets to the tokens for space efficiency.
*/
/**
* @internal
* @var escape_strings[]
* This string consists of NULL terminated pairs of strings, the first of
* every pair is an escape and the second is the value of the escape.
*/
static const char escape_strings[] =
/* most common escaped stuff */
"&quot;\0" "\x22\0"
"&amp;\0" "\x26\0"
"&apos;\0" "\x27\0"
"&lt;\0" "\x3c\0"
"&gt;\0" "\x3e\0"
/* all the rest */
"&nbsp;\0" "\xc2\xa0\0"
"&iexcl;\0" "\xc2\xa1\0"
"&cent;\0" "\xc2\xa2\0"
"&pound;\0" "\xc2\xa3\0"
"&curren;\0" "\xc2\xa4\0"
"&yen;\0" "\xc2\xa5\0"
"&brvbar;\0" "\xc2\xa6\0"
"&sect;\0" "\xc2\xa7\0"
"&uml;\0" "\xc2\xa8\0"
"&copy;\0" "\xc2\xa9\0"
"&ordf;\0" "\xc2\xaa\0"
"&laquo;\0" "\xc2\xab\0"
"&not;\0" "\xc2\xac\0"
"&shy;\0" "\xc2\xad\0"
"&reg;\0" "\xc2\xae\0"
"&macr;\0" "\xc2\xaf\0"
"&deg;\0" "\xc2\xb0\0"
"&plusmn;\0" "\xc2\xb1\0"
"&sup2;\0" "\xc2\xb2\0"
"&sup3;\0" "\xc2\xb3\0"
"&acute;\0" "\xc2\xb4\0"
"&micro;\0" "\xc2\xb5\0"
"&para;\0" "\xc2\xb6\0"
"&middot;\0" "\xc2\xb7\0"
"&cedil;\0" "\xc2\xb8\0"
"&sup1;\0" "\xc2\xb9\0"
"&ordm;\0" "\xc2\xba\0"
"&raquo;\0" "\xc2\xbb\0"
"&frac14;\0" "\xc2\xbc\0"
"&frac12;\0" "\xc2\xbd\0"
"&frac34;\0" "\xc2\xbe\0"
"&iquest;\0" "\xc2\xbf\0"
"&Agrave;\0" "\xc3\x80\0"
"&Aacute;\0" "\xc3\x81\0"
"&Acirc;\0" "\xc3\x82\0"
"&Atilde;\0" "\xc3\x83\0"
"&Auml;\0" "\xc3\x84\0"
"&Aring;\0" "\xc3\x85\0"
"&Aelig;\0" "\xc3\x86\0"
"&Ccedil;\0" "\xc3\x87\0"
"&Egrave;\0" "\xc3\x88\0"
"&Eacute;\0" "\xc3\x89\0"
"&Ecirc;\0" "\xc3\x8a\0"
"&Euml;\0" "\xc3\x8b\0"
"&Igrave;\0" "\xc3\x8c\0"
"&Iacute;\0" "\xc3\x8d\0"
"&Icirc;\0" "\xc3\x8e\0"
"&Iuml;\0" "\xc3\x8f\0"
"&Eth;\0" "\xc3\x90\0"
"&Ntilde;\0" "\xc3\x91\0"
"&Ograve;\0" "\xc3\x92\0"
"&Oacute;\0" "\xc3\x93\0"
"&Ocirc;\0" "\xc3\x94\0"
"&Otilde;\0" "\xc3\x95\0"
"&Ouml;\0" "\xc3\x96\0"
"&times;\0" "\xc3\x97\0"
"&Oslash;\0" "\xc3\x98\0"
"&Ugrave;\0" "\xc3\x99\0"
"&Uacute;\0" "\xc3\x9a\0"
"&Ucirc;\0" "\xc3\x9b\0"
"&Yacute;\0" "\xc3\x9d\0"
"&Thorn;\0" "\xc3\x9e\0"
"&szlig;\0" "\xc3\x9f\0"
"&agrave;\0" "\xc3\xa0\0"
"&aacute;\0" "\xc3\xa1\0"
"&acirc;\0" "\xc3\xa2\0"
"&atilde;\0" "\xc3\xa3\0"
"&auml;\0" "\xc3\xa4\0"
"&aring;\0" "\xc3\xa5\0"
"&aelig;\0" "\xc3\xa6\0"
"&ccedil;\0" "\xc3\xa7\0"
"&egrave;\0" "\xc3\xa8\0"
"&eacute;\0" "\xc3\xa9\0"
"&ecirc;\0" "\xc3\xaa\0"
"&euml;\0" "\xc3\xab\0"
"&igrave;\0" "\xc3\xac\0"
"&iacute;\0" "\xc3\xad\0"
"&icirc;\0" "\xc3\xae\0"
"&iuml;\0" "\xc3\xaf\0"
"&eth;\0" "\xc3\xb0\0"
"&ntilde;\0" "\xc3\xb1\0"
"&ograve;\0" "\xc3\xb2\0"
"&oacute;\0" "\xc3\xb3\0"
"&ocirc;\0" "\xc3\xb4\0"
"&otilde;\0" "\xc3\xb5\0"
"&ouml;\0" "\xc3\xb6\0"
"&divide;\0" "\xc3\xb7\0"
"&oslash;\0" "\xc3\xb8\0"
"&ugrave;\0" "\xc3\xb9\0"
"&uacute;\0" "\xc3\xba\0"
"&ucirc;\0" "\xc3\xbb\0"
"&uuml;\0" "\xc3\xbc\0"
"&yacute;\0" "\xc3\xbd\0"
"&thorn;\0" "\xc3\xbe\0"
"&yuml;\0" "\xc3\xbf\0"
"&alpha;\0" "\xce\x91\0"
"&beta;\0" "\xce\x92\0"
"&gamma;\0" "\xce\x93\0"
"&delta;\0" "\xce\x94\0"
"&epsilon;\0" "\xce\x95\0"
"&zeta;\0" "\xce\x96\0"
"&eta;\0" "\xce\x97\0"
"&theta;\0" "\xce\x98\0"
"&iota;\0" "\xce\x99\0"
"&kappa;\0" "\xce\x9a\0"
"&lambda;\0" "\xce\x9b\0"
"&mu;\0" "\xce\x9c\0"
"&nu;\0" "\xce\x9d\0"
"&xi;\0" "\xce\x9e\0"
"&omicron;\0" "\xce\x9f\0"
"&pi;\0" "\xce\xa0\0"
"&rho;\0" "\xce\xa1\0"
"&sigma;\0" "\xce\xa3\0"
"&tau;\0" "\xce\xa4\0"
"&upsilon;\0" "\xce\xa5\0"
"&phi;\0" "\xce\xa6\0"
"&chi;\0" "\xce\xa7\0"
"&psi;\0" "\xce\xa8\0"
"&omega;\0" "\xce\xa9\0"
"&hellip;\0" "\xe2\x80\xa6\0"
"&euro;\0" "\xe2\x82\xac\0"
"&larr;\0" "\xe2\x86\x90\0"
"&uarr;\0" "\xe2\x86\x91\0"
"&rarr;\0" "\xe2\x86\x92\0"
"&darr;\0" "\xe2\x86\x93\0"
"&harr;\0" "\xe2\x86\x94\0"
"&larr;\0" "\xe2\x87\x90\0"
"&rarr;\0" "\xe2\x87\x92\0"
"&forall;\0" "\xe2\x88\x80\0"
"&exist;\0" "\xe2\x88\x83\0"
"&nabla;\0" "\xe2\x88\x87\0"
"&prod;\0" "\xe2\x88\x8f\0"
"&sum;\0" "\xe2\x88\x91\0"
"&and;\0" "\xe2\x88\xa7\0"
"&or;\0" "\xe2\x88\xa8\0"
"&int;\0" "\xe2\x88\xab\0"
"&ne;\0" "\xe2\x89\xa0\0"
"&equiv;\0" "\xe2\x89\xa1\0"
"&oplus;\0" "\xe2\x8a\x95\0"
"&perp;\0" "\xe2\x8a\xa5\0"
"&dagger;\0" "\xe2\x80\xa0\0"
"&Dagger;\0" "\xe2\x80\xa1\0"
"&bull;\0" "\xe2\x80\xa2\0"
"&zwnj;\0" "\xe2\x80\x8c\0"
"&zwj;\0" "\xe2\x80\x8d\0"
"&lrm;\0" "\xe2\x80\x8e\0"
"&rlm;\0" "\xe2\x80\x8f\0"
;
static inline void
_escaped_advance_after_end_of_string(const char **p_buf)
{
while (**p_buf != 0) (*p_buf)++;
(*p_buf)++;
}
static inline int
_escaped_is_eq_and_advance(const char *s, const char *s_end,
const char **p_m, const char *m_end)
{
Eina_Bool reached_end;
for (;((s < s_end) && (*p_m < m_end)); s++, (*p_m)++)
{
if (*s != **p_m)
{
_escaped_advance_after_end_of_string(p_m);
return 0;
}
}
reached_end = !**p_m;
if (*p_m < m_end)
_escaped_advance_after_end_of_string(p_m);
return ((s == s_end) && reached_end);
}
static inline const char *
_escaped_char_get(const char *s, const char *s_end)
{
/* Handle numeric escape codes. */
if (s[1] == '#')
{
static char utf8_escape[7]; /* Support up to 6 bytes utf8 */
char ustr[10];
Eina_Unicode uchar[2] = { 0, 0 };
char *utf8_char;
size_t len = 0;
int base = 10;
s += 2; /* Skip "&#" */
if ((*s == 'x') && (*s == 'X'))
{
s++;
base = 16;
}
len = s_end - s;
if (len > sizeof(ustr))
len = sizeof(ustr);
memcpy(ustr, s, len);
ustr[len - 1] = '\0';
uchar[0] = strtol(ustr, NULL, base);
if (uchar[0] == 0)
return NULL;
utf8_char = eina_unicode_unicode_to_utf8(uchar, NULL);
// eina_unicode_unicode_to_utf8() always creates a string that
// is nul terminated - guaranteed
if (utf8_char)
{
strcpy(utf8_escape, utf8_char);
free(utf8_char);
}
return utf8_escape;
}
else
{
const char *map_itr, *map_end;
map_itr = escape_strings;
map_end = map_itr + sizeof(escape_strings);
while (map_itr < map_end)
{
if (_escaped_is_eq_and_advance(s, s_end, &map_itr, map_end))
return map_itr;
if (map_itr < map_end)
_escaped_advance_after_end_of_string(&map_itr);
}
}
return NULL;
}
static char *
_text_util_markup_to_text(const char *text)
{
Eina_Strbuf *sbuf;
char *s, *p, *ret;
char *tag_start, *tag_end, *esc_start, *esc_end;
if (!text) return NULL;
tag_start = tag_end = esc_start = esc_end = NULL;
sbuf = eina_strbuf_new();
p = (char *)text;
s = p;
/* This loop goes through all of the mark up text until it finds format
* tags, escape sequences or the terminating NULL. When it finds either
* of those, it appends the text found up until that point to the textblock
* proccesses whatever found. It repeats itself until the termainating
* NULL is reached. */
for (;;)
{
/* If we got to the end of string or just finished/started tag
* or escape sequence handling. */
if ((*p == 0) ||
(tag_end) || (esc_end) ||
(tag_start) || (esc_start))
{
if (tag_end)
{
/* If we reached to a tag ending, analyze the tag */
char *ttag;
size_t ttag_len;
tag_start++; /* Skip the < */
tag_end--; /* Skip the > */
if ((tag_end > tag_start) && (*(tag_end - 1) == '/'))
{
tag_end --; /* Skip the terminating '/' */
while (*(tag_end - 1) == ' ')
tag_end--; /* skip trailing ' ' */
}
ttag_len = tag_end - tag_start;
ttag = malloc(ttag_len + 1);
if (ttag)
{
memcpy(ttag, tag_start, ttag_len);
ttag[ttag_len] = 0;
if (_IS_PARAGRAPH_SEPARATOR_SIMPLE(ttag))
eina_strbuf_append(sbuf, _PARAGRAPH_SEPARATOR_UTF8);
else if (_IS_LINE_SEPARATOR(ttag))
eina_strbuf_append(sbuf, _NEWLINE_UTF8);
else if (_IS_TAB(ttag))
eina_strbuf_append(sbuf, _TAB_UTF8);
else if (!strncmp(ttag, "item", 4))
eina_strbuf_append(sbuf, _REPLACEMENT_CHAR_UTF8);
free(ttag);
}
tag_start = tag_end = NULL;
}
else if (esc_end)
{
const char *escape;
escape = _escaped_char_get(esc_start, esc_end + 1);
if (escape) eina_strbuf_append(sbuf, escape);
esc_start = esc_end = NULL;
}
else if (*p == 0)
{
if (s)
{
eina_strbuf_append_length(sbuf, s, p - s);
s = NULL;
}
else
{
ERR("There is a invalid markup tag at positoin '%u'. Please check the text.", (unsigned int) (p - text));
}
}
if (*p == 0)
break;
}
if (*p == '<')
{
if (!esc_start)
{
/* Append the text prior to this to the textblock and
* mark the start of the tag */
tag_start = p;
tag_end = NULL;
if (s)
{
eina_strbuf_append_length(sbuf, s, p - s);
s = NULL;
}
else
{
ERR("There is a invalid markup tag at positoin '%u'. Please check the text.", (unsigned int) (p - text));
}
}
}
else if (*p == '>')
{
if (tag_start)
{
tag_end = p + 1;
s = p + 1;
}
}
else if (*p == '&')
{
if (!tag_start)
{
/* Append the text prior to this to the textblock and mark
* the start of the escape sequence */
esc_start = p;
esc_end = NULL;
if (s)
{
eina_strbuf_append_length(sbuf, s, p - s);
s = NULL;
}
else
{
ERR("There is a invalid markup tag at positoin '%u'. Please check the text.", (unsigned int) (p - text));
}
}
}
else if (*p == ';')
{
if (esc_start)
{
esc_end = p;
s = p + 1;
}
}
p++;
}
ret = eina_strbuf_string_steal(sbuf);
eina_strbuf_free(sbuf);
return ret;
}
static EOLIAN char*
_efl_text_markup_util_markup_to_text(Eo *class EINA_UNUSED,
void *_pd EINA_UNUSED, const char *text)
{
return _text_util_markup_to_text(text);
}
#include "interfaces/efl_text_markup_util.eo.c"