utf8 dict fixes/working code from florian

SVN revision: 38274
2008-12-22 00:23:41 +00:00 · 2008-12-22 00:23:41 +00:00 · 4a36706647
parent 88e091d3ba
commit 4a36706647
2 changed files with 74 additions and 12 deletions
--- a/1
+++ b/1
@ -21,3 +21,4 @@ morlenxus (Brian Miculcy) <morlenxus@gmx.net>
 Toma- (Tom Haste) <tomhaste@gmail.com>
 k-s (Gustavo Sverzut Barbieri) <barbieri@profusion.mobi>
 Peter van de Werken <pwerken-e@a-eskwadraat.nl>
+Florian Hackenberger <florian@hackenberger.at>
--- a/src/modules/illume/e_kbd_dict.c
+++ b/src/modules/illume/e_kbd_dict.c
@ -6,8 +6,13 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/mman.h>
+#define _GNU_SOURCE
+#include <wchar.h>
+#include <wctype.h>

-
+/** A lookup table for normalising strings for dictionary lookups
+ * We currently limit the normalisation to characters in the latin1 charset.
+ */
 #define MAXLATIN 0x100
 static unsigned char _e_kbd_normalise_base[MAXLATIN];
 static unsigned char _e_kbd_normalise_ready = 0;
@ -85,16 +90,16 @@ _e_kbd_normalise_init(void)
   
   if (_e_kbd_normalise_ready) return;
   _e_kbd_normalise_ready = 1;
-   for (i = 0; i < 128; i++)
+   for (i = 0; i < 128; i++) // The 7-bit asci characters map to their lower case
     _e_kbd_normalise_base[i] = tolower(i);
-   for (;i < MAXLATIN; i++)
+   for (;i < MAXLATIN; i++) // Map the rest of the latin1 charset according to the table above
     {
 	int glyph;
 	int j;
 	
-	for (j = 0; j < 63; j++)
+	for (j = 0; j < 63; j++) // Iterate over the table
 	  {
-	     evas_string_char_next_get(table[j][0], 0, &glyph);
+	     evas_string_char_next_get(table[j][0], 0, &glyph); // Decode a multi byte UTF8 string
 	     if (glyph == i)
 	       {
 		  _e_kbd_normalise_base[i] = *table[j][1];
@ -104,20 +109,47 @@ _e_kbd_normalise_init(void)
     }
 }

+/** Map one code point to its normalised form: values inside the latin1 lookup table are translated through it (e.g. ü -> u), everything else goes through towlower() */
 static int
 _e_kbd_dict_letter_normalise(int glyph)
 {
-   // FIXME: ö -> o, ä -> a, Ó -> o etc. - ie normalise to latin-1
-   if (glyph < MAXLATIN) return _e_kbd_normalise_base[glyph];
-   return tolower(glyph) & 0x7f;
+   const int in_table = (glyph > 0) && (glyph < MAXLATIN);
+
+   if (!in_table) return towlower(glyph);
+   return _e_kbd_normalise_base[glyph];
 }

+/** Replace, in place, every wide character of a NUL terminated string with
+ * the result of _e_kbd_dict_letter_normalise (e.g. ü -> u) */
+static void _e_kbd_dict_string_normalise(wchar_t *str)
+{
+   wchar_t *cur;
+
+   for (cur = str; *cur; cur++)
+     *cur = _e_kbd_dict_letter_normalise(*cur);
+}
+
+/** Normalise and compare two multi byte strings
+ *
+ * Both strings are converted to wide characters (the conversion follows the
+ * current locale), normalised with _e_kbd_dict_string_normalise and then
+ * compared in a case-insensitive way. If either string cannot be converted
+ * (invalid multi byte sequence) the plain byte-wise case-insensitive
+ * comparison is used instead.
+ * @param a The first string
+ * @param b The second string
+ * @param len Maximum number of bytes of a and b to look at, or <= 0 to
+ *            compare the whole strings
+ * @return Result according to wcscasecmp/wcsncasecmp(a, b) after normalisation
+ */
 static int
 _e_kbd_dict_normalized_strncmp(const char *a, const char *b, int len)
 {
-   // FIXME: normalise 2 strings and then compare
-   if (len < 0) return strcasecmp(a, b);
-   return strncasecmp(a, b, len);
+   mbstate_t shiftState;
+   size_t n_codep_a, n_codep_b;
+
+   // Calculate the size of the wchar buffers we will need to convert a and b
+   // (the number of codepoints in a/b). With a NULL destination the source
+   // pointers are left untouched, so a and b stay valid for the second pass.
+   memset(&shiftState, 0, sizeof(mbstate_t));
+   n_codep_a = len > 0 ? mbsnrtowcs(NULL, &a, len, 0, &shiftState) : mbsrtowcs(NULL, &a, 0, &shiftState);
+   memset(&shiftState, 0, sizeof(mbstate_t));
+   n_codep_b = len > 0 ? mbsnrtowcs(NULL, &b, len, 0, &shiftState) : mbsrtowcs(NULL, &b, 0, &shiftState);
+   // (size_t)-1 reports an invalid sequence: never size a buffer from it,
+   // fall back to the byte-wise comparison instead
+   if ((n_codep_a == (size_t)-1) || (n_codep_b == (size_t)-1))
+     return len > 0 ? strncasecmp(a, b, len) : strcasecmp(a, b);
+
+   wchar_t awc[n_codep_a + 1]; awc[n_codep_a] = L'\0';
+   wchar_t bwc[n_codep_b + 1]; bwc[n_codep_b] = L'\0';
+   // Convert a and b to wchar strings so we can normalise them with the lookup table
+   memset(&shiftState, 0, sizeof(mbstate_t));
+   if (len > 0) mbsnrtowcs(awc, &a, len, n_codep_a, &shiftState);
+   else mbsrtowcs(awc, &a, n_codep_a, &shiftState);
+   memset(&shiftState, 0, sizeof(mbstate_t));
+   if (len > 0) mbsnrtowcs(bwc, &b, len, n_codep_b, &shiftState);
+   else mbsrtowcs(bwc, &b, n_codep_b, &shiftState);
+   _e_kbd_dict_string_normalise(awc);
+   _e_kbd_dict_string_normalise(bwc);
+   // len counts bytes, so the two prefixes may hold a different number of
+   // codepoints; only the common amount is compared in that case
+   if (len > 0) return wcsncasecmp(awc, bwc, n_codep_a > n_codep_b ? n_codep_b : n_codep_a);
+   return wcscasecmp(awc, bwc);
 }

 static int
@ -126,6 +158,7 @@ _e_kbd_dict_normalized_strcmp(const char *a, const char *b)
   return _e_kbd_dict_normalized_strncmp(a, b, -1);
 }

+// FIXME: Does not support multi byte UTF8, does it?
 static void
 _e_kbd_dict_normalized_strcpy(char *dst, const char *src)
 {
@ -208,7 +241,14 @@ _e_kbd_dict_lookup_build_line(E_Kbd_Dict *kd, const char *p, const char *eol,
   s[eol - p] = 0;
   p2 = evas_string_char_next_get(s, 0, &(glyphs[0]));
   if ((p2 > 0) && (glyphs[0] > 0))
-     p2 = evas_string_char_next_get(s, p2, &(glyphs[1]));
+   {
+      glyphs[0] = _e_kbd_dict_letter_normalise(glyphs[0]);
+      p2 = evas_string_char_next_get(s, p2, &(glyphs[1]));
+      if ((p2 > 0) && (glyphs[1] > 0))
+      {
+         glyphs[1] = _e_kbd_dict_letter_normalise(glyphs[1]);
+      }
+   }
 }

 static void
@ -522,6 +562,27 @@ _e_kbd_dict_find(E_Kbd_Dict *kd, const char *word)
    */
   tword = alloca(strlen(word) + 1);
   _e_kbd_dict_normalized_strcpy(tword, word);
+
+/*
+   printf("search: %s\n", word);
+   // Convert word to wide character and normalise it
+   wchar_t *wtword;
+   mbstate_t shiftState; memset(&shiftState, 0, sizeof(mbstate_t));
+   size_t n_codep = mbsrtowcs(NULL, &word, 0, &shiftState);
+   printf("cp: %d\n", n_codep);
+   wtword = alloca(n_codep + 1);
+   wtword[n_codep] = '\0';
+   mbsrtowcs(wtword, &word, n_codep, &shiftState);
+   _e_kbd_dict_string_normalise(wtword);
+   printf("wchar: %ls\n", wtword);
+   // Convert it back to multi byte string
+   n_codep = wcsrtombs(NULL, (const wchar_t**)&wtword, 0, &shiftState);
+   printf("cp: %d\n", n_codep);
+   tword = alloca(n_codep + 1);
+   tword[n_codep] = '\0';
+   wcsrtombs(tword, (const wchar_t**)&wtword, n_codep, &shiftState);
+   printf("after conv: %s\n", tword);
+*/  
   p = eina_hash_find(kd->matches.leads, tword);
   if (p) return p;
   p2 = strlen(tword);