forked from enlightenment/enlightenment
utf8 dict fixes/working code from florian
SVN revision: 38274
This commit is contained in:
parent
88e091d3ba
commit
4a36706647
1
AUTHORS
1
AUTHORS
|
@ -21,3 +21,4 @@ morlenxus (Brian Miculcy) <morlenxus@gmx.net>
|
|||
Toma- (Tom Haste) <tomhaste@gmail.com>
|
||||
k-s (Gustavo Sverzut Barbieri) <barbieri@profusion.mobi>
|
||||
Peter van de Werken <pwerken-e@a-eskwadraat.nl>
|
||||
Florian Hackenberger <florian@hackenberger.at>
|
||||
|
|
|
@ -6,8 +6,13 @@
|
|||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#define _GNU_SOURCE
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
|
||||
|
||||
/** A lookup table for normalising strings for dictionary lookups
|
||||
* We currently limit the normalisation to characters in the latin1 charset.
|
||||
*/
|
||||
#define MAXLATIN 0x100
|
||||
static unsigned char _e_kbd_normalise_base[MAXLATIN];
|
||||
static unsigned char _e_kbd_normalise_ready = 0;
|
||||
|
@ -85,16 +90,16 @@ _e_kbd_normalise_init(void)
|
|||
|
||||
if (_e_kbd_normalise_ready) return;
|
||||
_e_kbd_normalise_ready = 1;
|
||||
for (i = 0; i < 128; i++)
|
||||
for (i = 0; i < 128; i++) // The 7-bit asci characters map to their lower case
|
||||
_e_kbd_normalise_base[i] = tolower(i);
|
||||
for (;i < MAXLATIN; i++)
|
||||
for (;i < MAXLATIN; i++) // Map the rest of the latin1 charset according to the table above
|
||||
{
|
||||
int glyph;
|
||||
int j;
|
||||
|
||||
for (j = 0; j < 63; j++)
|
||||
for (j = 0; j < 63; j++) // Iterate over the table
|
||||
{
|
||||
evas_string_char_next_get(table[j][0], 0, &glyph);
|
||||
evas_string_char_next_get(table[j][0], 0, &glyph); // Decode a multi byte UTF8 string
|
||||
if (glyph == i)
|
||||
{
|
||||
_e_kbd_normalise_base[i] = *table[j][1];
|
||||
|
@ -104,20 +109,47 @@ _e_kbd_normalise_init(void)
|
|||
}
|
||||
}
|
||||
|
||||
/** Normalise a wide character according to a normalisation mapping (e.g. ü -> u) */
|
||||
static int
|
||||
_e_kbd_dict_letter_normalise(int glyph)
|
||||
{
|
||||
// FIXME: ö -> o, ä -> a, Ó -> o etc. - ie normalise to latin-1
|
||||
if (glyph < MAXLATIN) return _e_kbd_normalise_base[glyph];
|
||||
return tolower(glyph) & 0x7f;
|
||||
if (glyph > 0 && glyph < MAXLATIN) return _e_kbd_normalise_base[glyph];
|
||||
return towlower(glyph);
|
||||
}
|
||||
|
||||
/** Normalise a wide character string in place (e.g. ü -> u)
 *
 * Every character up to the terminating L'\0' is replaced by the value
 * returned by _e_kbd_dict_letter_normalise for it.
 * @param str The nul-terminated wide character string to rewrite
 */
static void
_e_kbd_dict_string_normalise(wchar_t *str)
{
   wchar_t *cur;

   for (cur = str; *cur; cur++)
     *cur = _e_kbd_dict_letter_normalise(*cur);
}
|
||||
|
||||
/** Normalise and compare two strings
 *
 * Both multi byte strings are converted to wide character strings, normalised
 * using _e_kbd_dict_string_normalise and then compared in a case-insensitive
 * way.
 * @param a The first string
 * @param b The second string
 * @param len The maximum number of bytes of a and b to convert; if it is not
 *            positive the whole strings are converted and compared
 * @return Result according to wcsncasecmp (len > 0) or wcscasecmp (otherwise)
 *         on the normalised strings. If either string cannot be converted,
 *         the result of strncasecmp/strcasecmp on the raw bytes instead.
 */
static int
_e_kbd_dict_normalized_strncmp(const char *a, const char *b, int len)
{
   mbstate_t state;
   const char *src;
   size_t n_codep_a, n_codep_b;

   // Calculate the size of the wchar buffers we will need to convert a and b
   // (the number of code points in a/b). Each conversion starts from a fresh
   // shift state and works on a scratch pointer so a and b stay untouched.
   memset(&state, 0, sizeof(state));
   src = a;
   n_codep_a = (len > 0) ? mbsnrtowcs(NULL, &src, (size_t)len, 0, &state)
                         : mbsrtowcs(NULL, &src, 0, &state);
   memset(&state, 0, sizeof(state));
   src = b;
   n_codep_b = (len > 0) ? mbsnrtowcs(NULL, &src, (size_t)len, 0, &state)
                         : mbsrtowcs(NULL, &src, 0, &state);

   // (size_t)-1 signals an invalid multi byte sequence. Sizing a buffer with
   // it would wrap to zero, so fall back to a plain byte-wise comparison.
   if ((n_codep_a == (size_t)-1) || (n_codep_b == (size_t)-1))
     {
        if (len > 0) return strncasecmp(a, b, (size_t)len);
        return strcasecmp(a, b);
     }

   {
      wchar_t awc[n_codep_a + 1];
      wchar_t bwc[n_codep_b + 1];

      // Convert a and b to wchar strings so we can normalise them with the
      // lookup table. The conversions stop after n_codep_* characters, so the
      // terminators are written explicitly.
      memset(&state, 0, sizeof(state));
      src = a;
      if (len > 0) mbsnrtowcs(awc, &src, (size_t)len, n_codep_a, &state);
      else mbsrtowcs(awc, &src, n_codep_a, &state);
      awc[n_codep_a] = L'\0';

      memset(&state, 0, sizeof(state));
      src = b;
      if (len > 0) mbsnrtowcs(bwc, &src, (size_t)len, n_codep_b, &state);
      else mbsrtowcs(bwc, &src, n_codep_b, &state);
      bwc[n_codep_b] = L'\0';

      _e_kbd_dict_string_normalise(awc);
      _e_kbd_dict_string_normalise(bwc);

      // With a byte limit the two strings may hold a different number of
      // code points; only the shorter count is compared.
      if (len > 0)
        return wcsncasecmp(awc, bwc, (n_codep_a > n_codep_b) ? n_codep_b : n_codep_a);
      return wcscasecmp(awc, bwc);
   }
}
|
||||
|
||||
static int
|
||||
|
@ -126,6 +158,7 @@ _e_kbd_dict_normalized_strcmp(const char *a, const char *b)
|
|||
return _e_kbd_dict_normalized_strncmp(a, b, -1);
|
||||
}
|
||||
|
||||
// FIXME: Does not support multi byte UTF8, does it?
|
||||
static void
|
||||
_e_kbd_dict_normalized_strcpy(char *dst, const char *src)
|
||||
{
|
||||
|
@ -208,7 +241,14 @@ _e_kbd_dict_lookup_build_line(E_Kbd_Dict *kd, const char *p, const char *eol,
|
|||
s[eol - p] = 0;
|
||||
p2 = evas_string_char_next_get(s, 0, &(glyphs[0]));
|
||||
if ((p2 > 0) && (glyphs[0] > 0))
|
||||
p2 = evas_string_char_next_get(s, p2, &(glyphs[1]));
|
||||
{
|
||||
glyphs[0] = _e_kbd_dict_letter_normalise(glyphs[0]);
|
||||
p2 = evas_string_char_next_get(s, p2, &(glyphs[1]));
|
||||
if ((p2 > 0) && (glyphs[1] > 0))
|
||||
{
|
||||
glyphs[1] = _e_kbd_dict_letter_normalise(glyphs[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -522,6 +562,27 @@ _e_kbd_dict_find(E_Kbd_Dict *kd, const char *word)
|
|||
*/
|
||||
tword = alloca(strlen(word) + 1);
|
||||
_e_kbd_dict_normalized_strcpy(tword, word);
|
||||
|
||||
/*
|
||||
printf("search: %s\n", word);
|
||||
// Convert word to wide character and normalise it
|
||||
wchar_t *wtword;
|
||||
mbstate_t shiftState; memset(&shiftState, 0, sizeof(mbstate_t));
|
||||
size_t n_codep = mbsrtowcs(NULL, &word, 0, &shiftState);
|
||||
printf("cp: %d\n", n_codep);
|
||||
wtword = alloca(n_codep + 1);
|
||||
wtword[n_codep] = '\0';
|
||||
mbsrtowcs(wtword, &word, n_codep, &shiftState);
|
||||
_e_kbd_dict_string_normalise(wtword);
|
||||
printf("wchar: %ls\n", wtword);
|
||||
// Convert it back to multi byte string
|
||||
n_codep = wcsrtombs(NULL, (const wchar_t**)&wtword, 0, &shiftState);
|
||||
printf("cp: %d\n", n_codep);
|
||||
tword = alloca(n_codep + 1);
|
||||
tword[n_codep] = '\0';
|
||||
wcsrtombs(tword, (const wchar_t**)&wtword, n_codep, &shiftState);
|
||||
printf("after conv: %s\n", tword);
|
||||
*/
|
||||
p = eina_hash_find(kd->matches.leads, tword);
|
||||
if (p) return p;
|
||||
p2 = strlen(tword);
|
||||
|
|
Loading…
Reference in New Issue