From 4a367066475915fde35133dce110356134290da2 Mon Sep 17 00:00:00 2001 From: Carsten Haitzler Date: Mon, 22 Dec 2008 00:23:41 +0000 Subject: [PATCH] utf8 dict fixes/working code from florian SVN revision: 38274 --- AUTHORS | 1 + src/modules/illume/e_kbd_dict.c | 85 ++++++++++++++++++++++++++++----- 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/AUTHORS b/AUTHORS index bbedc8149..9934e8400 100644 --- a/AUTHORS +++ b/AUTHORS @@ -21,3 +21,4 @@ morlenxus (Brian Miculcy) Toma- (Tom Haste) k-s (Gustavo Sverzut Barbieri) Peter van de Werken +Florian Hackenberger diff --git a/src/modules/illume/e_kbd_dict.c b/src/modules/illume/e_kbd_dict.c index d7be8d2da..3c7586465 100644 --- a/src/modules/illume/e_kbd_dict.c +++ b/src/modules/illume/e_kbd_dict.c @@ -6,8 +6,13 @@ #include #include #include +#define _GNU_SOURCE +#include +#include - +/** A lookup table for normalising strings for dictionary lookups + * We currently limit the normalisation to characters in the latin1 charset. + */ #define MAXLATIN 0x100 static unsigned char _e_kbd_normalise_base[MAXLATIN]; static unsigned char _e_kbd_normalise_ready = 0; @@ -85,16 +90,16 @@ _e_kbd_normalise_init(void) if (_e_kbd_normalise_ready) return; _e_kbd_normalise_ready = 1; - for (i = 0; i < 128; i++) + for (i = 0; i < 128; i++) // The 7-bit ASCII characters map to their lower case _e_kbd_normalise_base[i] = tolower(i); - for (;i < MAXLATIN; i++) + for (;i < MAXLATIN; i++) // Map the rest of the latin1 charset according to the table above { int glyph; int j; - for (j = 0; j < 63; j++) + for (j = 0; j < 63; j++) // Iterate over the table { - evas_string_char_next_get(table[j][0], 0, &glyph); + evas_string_char_next_get(table[j][0], 0, &glyph); // Decode a multi byte UTF8 string if (glyph == i) { _e_kbd_normalise_base[i] = *table[j][1]; @@ -104,20 +109,47 @@ _e_kbd_normalise_init(void) } } +/** Normalise a wide character according to a normalisation mapping (e.g. 
ü -> u) */ static int _e_kbd_dict_letter_normalise(int glyph) { - // FIXME: ö -> o, ä -> a, Ó -> o etc. - ie normalise to latin-1 - if (glyph < MAXLATIN) return _e_kbd_normalise_base[glyph]; - return tolower(glyph) & 0x7f; + if (glyph > 0 && glyph < MAXLATIN) return _e_kbd_normalise_base[glyph]; + return towlower(glyph); } +/** Normalise a wide character string according to a normalisation mapping (e.g. ü -> u) */ +static void _e_kbd_dict_string_normalise(wchar_t *str) +{ + while(*str) { + *str = _e_kbd_dict_letter_normalise(*str); + str++; + } +} + +/** Normalise and compare two strings + * + * Normalise the string using _e_kbd_dict_string_normalise and then compare + * them in a case-insensitive way. + * @param a The first string + * @param b The second string + * @return Result according to strcasecmp(a, b) after normalisation + */ static int _e_kbd_dict_normalized_strncmp(const char *a, const char *b, int len) { - // FIXME: normalise 2 strings and then compare - if (len < 0) return strcasecmp(a, b); - return strncasecmp(a, b, len); + mbstate_t shiftState; memset(&shiftState, 0, sizeof(mbstate_t)); + // Calculate the size of the wchar buffer we will need to convert a and b (the number of codepoints in a/b) + size_t n_codep_a = len > 0 ? mbsnrtowcs(NULL, &a, len, 0, &shiftState) : mbsrtowcs(NULL, &a, 0, &shiftState); + size_t n_codep_b = len > 0 ? mbsnrtowcs(NULL, &a, len, 0, &shiftState) : mbsrtowcs(NULL, &a, 0, &shiftState); + wchar_t awc[n_codep_a+1]; awc[n_codep_a] = '\0'; + wchar_t bwc[n_codep_b+1]; bwc[n_codep_a] = '\0'; + // Convert a and b to wchar strings so we can normalise them with the lookup table + len > 0 ? mbsnrtowcs(awc, &a, len, n_codep_a, &shiftState) : mbsrtowcs(awc, &a, n_codep_a, &shiftState); + len > 0 ? mbsnrtowcs(bwc, &b, len, n_codep_b, &shiftState) : mbsrtowcs(bwc, &b, n_codep_b, &shiftState); + _e_kbd_dict_string_normalise(awc); + _e_kbd_dict_string_normalise(bwc); + if(len > 0) return wcsncasecmp(awc, bwc, n_codep_a > n_codep_b ? 
n_codep_b : n_codep_a); + return wcscasecmp(awc, bwc); } static int @@ -126,6 +158,7 @@ _e_kbd_dict_normalized_strcmp(const char *a, const char *b) return _e_kbd_dict_normalized_strncmp(a, b, -1); } +// FIXME: Does not support multi byte UTF8, does it? static void _e_kbd_dict_normalized_strcpy(char *dst, const char *src) { @@ -208,7 +241,14 @@ _e_kbd_dict_lookup_build_line(E_Kbd_Dict *kd, const char *p, const char *eol, s[eol - p] = 0; p2 = evas_string_char_next_get(s, 0, &(glyphs[0])); if ((p2 > 0) && (glyphs[0] > 0)) - p2 = evas_string_char_next_get(s, p2, &(glyphs[1])); + { + glyphs[0] = _e_kbd_dict_letter_normalise(glyphs[0]); + p2 = evas_string_char_next_get(s, p2, &(glyphs[1])); + if ((p2 > 0) && (glyphs[1] > 0)) + { + glyphs[1] = _e_kbd_dict_letter_normalise(glyphs[1]); + } + } } static void @@ -522,6 +562,27 @@ _e_kbd_dict_find(E_Kbd_Dict *kd, const char *word) */ tword = alloca(strlen(word) + 1); _e_kbd_dict_normalized_strcpy(tword, word); + +/* + printf("search: %s\n", word); + // Convert word to wide character and normalise it + wchar_t *wtword; + mbstate_t shiftState; memset(&shiftState, 0, sizeof(mbstate_t)); + size_t n_codep = mbsrtowcs(NULL, &word, 0, &shiftState); + printf("cp: %d\n", n_codep); + wtword = alloca(n_codep + 1); + wtword[n_codep] = '\0'; + mbsrtowcs(wtword, &word, n_codep, &shiftState); + _e_kbd_dict_string_normalise(wtword); + printf("wchar: %ls\n", wtword); + // Convert it back to multi byte string + n_codep = wcsrtombs(NULL, (const wchar_t**)&wtword, 0, &shiftState); + printf("cp: %d\n", n_codep); + tword = alloca(n_codep + 1); + tword[n_codep] = '\0'; + wcsrtombs(tword, (const wchar_t**)&wtword, n_codep, &shiftState); + printf("after conv: %s\n", tword); +*/ p = eina_hash_find(kd->matches.leads, tword); if (p) return p; p2 = strlen(tword);