enlightenment/src/modules/vkbd/e_kbd_dict.c

881 lines
23 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "e.h"
#include "e_kbd_dict.h"
#include <fcntl.h>
#include <sys/mman.h>
#define MAXLATIN 0x100
static unsigned char _e_kbd_normalise_base[MAXLATIN];
static unsigned char _e_kbd_normalise_ready = 0;
static void
_e_kbd_normalise_init(void)
{
int i;
const char *table[][2] =
{
{"À", "a"},
{"Á", "a"},
{"Â", "a"},
{"Ã", "a"},
{"Ä", "a"},
{"Å", "a"},
{"Æ", "a"},
{"Ç", "c"},
{"È", "e"},
{"É", "e"},
{"Ê", "e"},
{"Ë", "e"},
{"Ì", "i"},
{"Í", "i"},
{"Î", "i"},
{"Ï", "i"},
{"Ð", "d"},
{"Ñ", "n"},
{"Ò", "o"},
{"Ó", "o"},
{"Ô", "o"},
{"Õ", "o"},
{"Ö", "o"},
{"×", "x"},
{"Ø", "o"},
{"Ù", "u"},
{"Ú", "u"},
{"Û", "u"},
{"Ü", "u"},
{"Ý", "y"},
{"Þ", "p"},
{"ß", "s"},
{"à", "a"},
{"á", "a"},
{"â", "a"},
{"ã", "a"},
{"ä", "a"},
{"å", "a"},
{"æ", "a"},
{"ç", "c"},
{"è", "e"},
{"é", "e"},
{"ê", "e"},
{"ë", "e"},
{"ì", "i"},
{"í", "i"},
{"î", "i"},
{"ï", "i"},
{"ð", "o"},
{"ñ", "n"},
{"ò", "o"},
{"ó", "o"},
{"ô", "o"},
{"õ", "o"},
{"ö", "o"},
{"ø", "o"},
{"ù", "u"},
{"ú", "u"},
{"û", "u"},
{"ü", "u"},
{"ý", "y"},
{"þ", "p"},
{"ÿ", "y"}
}; // 63 items
if (_e_kbd_normalise_ready) return;
_e_kbd_normalise_ready = 1;
for (i = 0; i < 128; i++)
_e_kbd_normalise_base[i] = tolower(i);
for (;i < MAXLATIN; i++)
{
int glyph, j;
for (j = 0; j < 63; j++)
{
evas_string_char_next_get(table[j][0], 0, &glyph);
if (glyph == i)
{
_e_kbd_normalise_base[i] = *table[j][1];
break;
}
}
}
}
static int
_e_kbd_dict_letter_normalise(int glyph)
{
// FIXME: ö -> o, ä -> a, Ó -> o etc. - ie normalise to latin-1
if (glyph < MAXLATIN) return _e_kbd_normalise_base[glyph];
return tolower(glyph) & 0x7f;
}
static int
_e_kbd_dict_normalized_strncmp(const char *a, const char *b, int len)
{
// FIXME: normalise 2 strings and then compare
if (len < 0) return strcasecmp(a, b);
return strncasecmp(a, b, len);
}
static int
_e_kbd_dict_normalized_strcmp(const char *a, const char *b)
{
return _e_kbd_dict_normalized_strncmp(a, b, -1);
}
static void
_e_kbd_dict_normalized_strcpy(char *dst, const char *src)
{
const char *p;
char *d;
for (p = src, d = dst; *p; p++, d++)
*d = _e_kbd_dict_letter_normalise(*p);
*d = 0;
}
static int
_e_kbd_dict_matches_lookup_cb_sort(const void *d1, const void *d2)
{
const E_Kbd_Dict_Word *kw1, *kw2;
kw1 = d1;
kw2 = d2;
if (kw1->usage < kw2->usage) return 1;
else if (kw1->usage > kw2->usage) return -1;
return 0;
}
static int
_e_kbd_dict_writes_cb_sort(const void *d1, const void *d2)
{
const E_Kbd_Dict_Word *kw1, *kw2;
kw1 = d1;
kw2 = d2;
return _e_kbd_dict_normalized_strcmp(kw1->word, kw2->word);
}
static const char *
_e_kbd_dict_line_next(E_Kbd_Dict *kd, const char *p)
{
const char *e, *pp;
e = kd->file.dict + kd->file.size;
for (pp = p; pp < e; pp++)
if (*pp == '\n') return pp + 1;
return NULL;
}
static char *
_e_kbd_dict_line_parse(E_Kbd_Dict *kd EINA_UNUSED, const char *p, int *usage)
{
const char *ps;
char *wd = NULL;
for (ps = p; !isspace(*ps); ps++);
wd = malloc(ps - p + 1);
if (!wd) return NULL;
strncpy(wd, p, ps - p);
wd[ps - p] = 0;
if (*ps == '\n') *usage = 0;
else
{
ps++;
*usage = atoi(ps);
}
return wd;
}
static void
_e_kbd_dict_lookup_build_line(E_Kbd_Dict *kd EINA_UNUSED, const char *p, const char *eol, int *glyphs)
{
char *s;
int p2;
s = alloca(eol - p + 1);
strncpy(s, p, eol - p);
s[eol - p] = 0;
p2 = evas_string_char_next_get(s, 0, &(glyphs[0]));
if ((p2 > 0) && (glyphs[0] > 0))
evas_string_char_next_get(s, p2, &(glyphs[1]));
}
static void
_e_kbd_dict_lookup_build(E_Kbd_Dict *kd)
{
const char *p, *e, *eol, *base;
int glyphs[2], pglyphs[2];
int i, j, line = 0;
p = base = kd->file.dict;
e = p + kd->file.size;
pglyphs[0] = pglyphs[1] = 0;
for (j = 0; j < 128; j++)
{
for (i = 0; i < 128; i++)
kd->lookup.tuples[j][i] = -1;
}
while (p < e)
{
eol = strchr(p, '\n');
if (!eol) break;
line++;
if ((p - base) >= 0x7fffffff)
{
ERR("DICT %s TOO BIG! must be < 2GB", kd->file.file);
return;
}
if (eol > p)
{
glyphs[0] = glyphs[1] = 0;
_e_kbd_dict_lookup_build_line(kd, p, eol, glyphs);
if ((glyphs[1] != pglyphs[1]) || (glyphs[0] != pglyphs[0]))
{
int v1, v2;
if (isspace(glyphs[0]))
{
glyphs[0] = 0;
glyphs[1] = 0;
}
else if (isspace(glyphs[1]))
glyphs[1] = 0;
if (glyphs[0] == 0)
{
pglyphs[0] = pglyphs[1] = 0;
p = eol + 1;
continue;
}
v1 = _e_kbd_dict_letter_normalise(glyphs[0]);
v2 = _e_kbd_dict_letter_normalise(glyphs[1]);
if (kd->lookup.tuples[v1][v2] == -1)
kd->lookup.tuples[v1][v2] = (unsigned int)(p - base);
pglyphs[0] = v1;
pglyphs[1] = v2;
}
}
p = eol + 1;
}
}
static int
_e_kbd_dict_open(E_Kbd_Dict *kd)
{
struct stat st;
kd->file.fd = open(kd->file.file, O_RDONLY);
if (kd->file.fd < 0) return 0;
if (fstat(kd->file.fd, &st) < 0)
{
close(kd->file.fd);
return 0;
}
kd->file.size = st.st_size;
eina_mmap_safety_enabled_set(EINA_TRUE);
kd->file.dict = mmap(NULL, kd->file.size, PROT_READ, MAP_SHARED,
kd->file.fd, 0);
if ((kd->file.dict== MAP_FAILED) || (!kd->file.dict))
{
close(kd->file.fd);
return 0;
}
return 1;
}
static void
_e_kbd_dict_close(E_Kbd_Dict *kd)
{
if (kd->file.fd < 0) return;
memset(kd->lookup.tuples, 0, sizeof(kd->lookup.tuples));
munmap((void *)kd->file.dict, kd->file.size);
close(kd->file.fd);
kd->file.fd = -1;
kd->file.dict = NULL;
kd->file.size = 0;
}
EAPI E_Kbd_Dict *
e_kbd_dict_new(const char *file)
{
// alloc and load new dict - build quick-lookup table. words MUST be sorted
E_Kbd_Dict *kd;
_e_kbd_normalise_init();
kd = E_NEW(E_Kbd_Dict, 1);
if (!kd) return NULL;
kd->file.file = eina_stringshare_add(file);
if (!kd->file.file)
{
free(kd);
return NULL;
}
kd->file.fd = -1;
if (!_e_kbd_dict_open(kd))
{
eina_stringshare_del(kd->file.file);
free(kd);
return NULL;
}
_e_kbd_dict_lookup_build(kd);
return kd;
}
EAPI void
e_kbd_dict_free(E_Kbd_Dict *kd)
{
// free dict and anything in it
e_kbd_dict_word_letter_clear(kd);
e_kbd_dict_save(kd);
_e_kbd_dict_close(kd);
free(kd);
}
static E_Kbd_Dict_Word *
_e_kbd_dict_changed_write_find(E_Kbd_Dict *kd, const char *word)
{
Eina_List *l;
for (l = kd->changed.writes; l; l = l->next)
{
E_Kbd_Dict_Word *kw;
kw = l->data;
if (!strcmp(kw->word, word)) return kw;
}
return NULL;
}
EAPI void
e_kbd_dict_save(E_Kbd_Dict *kd)
{
FILE *f;
// XXX: disable personal dict saving for now as we don't merge personal
// XXX: and system dict stats very well at all
return;
// save any changes (new words added, usage adjustments).
// all words MUST be sorted
if (!kd->changed.writes) return;
if (kd->changed.flush_timer)
{
ecore_timer_del(kd->changed.flush_timer);
kd->changed.flush_timer = NULL;
}
ecore_file_unlink(kd->file.file);
f = fopen(kd->file.file, "w");
kd->changed.writes = eina_list_sort(kd->changed.writes,
eina_list_count(kd->changed.writes),
_e_kbd_dict_writes_cb_sort);
if (f)
{
const char *p, *pn;
p = kd->file.dict;
while (p)
{
char *wd;
int usage = 0;
pn = _e_kbd_dict_line_next(kd, p);
if (!pn)
{
fclose(f);
return;
}
wd = _e_kbd_dict_line_parse(kd, p, &usage);
if ((wd) && (strlen(wd) > 0))
{
if (kd->changed.writes)
{
int writeline = 0;
while (kd->changed.writes)
{
E_Kbd_Dict_Word *kw;
int cmp;
kw = kd->changed.writes->data;
cmp = _e_kbd_dict_normalized_strcmp(kw->word, wd);
if (cmp < 0)
{
fprintf(f, "%s %i\n", kw->word, kw->usage);
writeline = 1;
eina_stringshare_del(kw->word);
free(kw);
kd->changed.writes = eina_list_remove_list(kd->changed.writes, kd->changed.writes);
}
else if (cmp == 0)
{
fprintf(f, "%s %i\n", wd, kw->usage);
if (!strcmp(kw->word, wd))
writeline = 0;
else
writeline = 1;
eina_stringshare_del(kw->word);
free(kw);
kd->changed.writes = eina_list_remove_list(kd->changed.writes, kd->changed.writes);
break;
}
else if (cmp > 0)
{
writeline = 1;
break;
}
}
if (writeline)
fprintf(f, "%s %i\n", wd, usage);
}
else
fprintf(f, "%s %i\n", wd, usage);
}
free(wd);
p = pn;
if (p >= (kd->file.dict + kd->file.size)) break;
}
while (kd->changed.writes)
{
E_Kbd_Dict_Word *kw;
kw = kd->changed.writes->data;
fprintf(f, "%s %i\n", kw->word, kw->usage);
eina_stringshare_del(kw->word);
free(kw);
kd->changed.writes = eina_list_remove_list(kd->changed.writes, kd->changed.writes);
}
fclose(f);
}
_e_kbd_dict_close(kd);
if (_e_kbd_dict_open(kd)) _e_kbd_dict_lookup_build(kd);
}
static Eina_Bool
_e_kbd_dict_cb_save_flush(void *data)
{
E_Kbd_Dict *kd;
kd = data;
if ((kd->matches.list) || (kd->word.letters) || (kd->matches.deadends) ||
(kd->matches.leads))
return EINA_TRUE;
kd->changed.flush_timer = NULL;
e_kbd_dict_save(kd);
return EINA_FALSE;
}
static void
_e_kbd_dict_changed_write_add(E_Kbd_Dict *kd, const char *word, int usage)
{
E_Kbd_Dict_Word *kw;
kw = E_NEW(E_Kbd_Dict_Word, 1);
kw->word = eina_stringshare_add(word);
kw->usage = usage;
kd->changed.writes = eina_list_prepend(kd->changed.writes, kw);
if (eina_list_count(kd->changed.writes) > 64)
e_kbd_dict_save(kd);
else
{
if (kd->changed.flush_timer)
ecore_timer_del(kd->changed.flush_timer);
kd->changed.flush_timer =
ecore_timer_add(5.0, _e_kbd_dict_cb_save_flush, kd);
}
}
static const char *
_e_kbd_dict_find_pointer(E_Kbd_Dict *kd, const char *p, int baselen, const char *word)
{
const char *pn;
int len;
if (!p) return NULL;
len = strlen(word);
while (p)
{
pn = _e_kbd_dict_line_next(kd, p);
if (!pn) return NULL;
if ((pn - p) > len)
{
if (!_e_kbd_dict_normalized_strncmp(p, word, len))
return p;
}
if (_e_kbd_dict_normalized_strncmp(p, word, baselen))
return NULL;
p = pn;
if (p >= (kd->file.dict + kd->file.size)) break;
}
return NULL;
}
static const char *
_e_kbd_dict_find(E_Kbd_Dict *kd, const char *word)
{
const char *p;
char *tword;
int glyphs[2], p2, v1, v2, i;
/* work backwards in leads. i.e.:
* going
* goin
* goi
* go
* g
*/
tword = alloca(strlen(word) + 1);
_e_kbd_dict_normalized_strcpy(tword, word);
p = eina_hash_find(kd->matches.leads, tword);
if (p) return p;
p2 = strlen(tword);
while (tword[0])
{
p2 = evas_string_char_prev_get(tword, p2, &i);
if (p2 < 0) break;
tword[p2] = 0;
p = eina_hash_find(kd->matches.leads, tword);
if (p)
return _e_kbd_dict_find_pointer(kd, p, p2, word);
}
/* looking at leads going back letters didn't work */
p = kd->file.dict;
if ((p[0] == '\n') && (kd->file.size <= 1)) return NULL;
glyphs[0] = glyphs[1] = 0;
p2 = evas_string_char_next_get(word, 0, &(glyphs[0]));
if ((p2 > 0) && (glyphs[0] > 0))
p2 = evas_string_char_next_get(word, p2, &(glyphs[1]));
v1 = _e_kbd_dict_letter_normalise(glyphs[0]);
if (glyphs[1] != 0)
{
v2 = _e_kbd_dict_letter_normalise(glyphs[1]);
if (kd->lookup.tuples[v1][v2] >= 0)
p = kd->file.dict + kd->lookup.tuples[v1][v2];
}
else
{
for (i = 0; i < 128; i++)
{
if (kd->lookup.tuples[v1][i] >= 0)
p = kd->file.dict + kd->lookup.tuples[v1][i];
if (p) break;
}
}
return _e_kbd_dict_find_pointer(kd, p, p2, word);
}
static const char *
_e_kbd_dict_find_full(E_Kbd_Dict *kd, const char *word)
{
const char *p;
int len;
p = _e_kbd_dict_find(kd, word);
if (!p) return NULL;
len = strlen(word);
if (isspace(p[len])) return p;
return NULL;
}
EAPI void
e_kbd_dict_word_usage_adjust(E_Kbd_Dict *kd, const char *word, int adjust)
{
// add "adjust" to word usage count
E_Kbd_Dict_Word *kw;
kw = _e_kbd_dict_changed_write_find(kd, word);
if (kw)
{
kw->usage += adjust;
if (kd->changed.flush_timer)
ecore_timer_del(kd->changed.flush_timer);
kd->changed.flush_timer = ecore_timer_add(5.0, _e_kbd_dict_cb_save_flush, kd);
}
else
{
const char *line;
int usage = 0;
line = _e_kbd_dict_find_full(kd, word);
if (line)
{
char *wd;
// FIXME: we need to find an EXACT line match - case and all
wd = _e_kbd_dict_line_parse(kd, line, &usage);
free(wd);
}
usage += adjust;
_e_kbd_dict_changed_write_add(kd, word, usage);
}
}
EAPI void
e_kbd_dict_word_delete(E_Kbd_Dict *kd, const char *word)
{
// delete a word from the dictionary
E_Kbd_Dict_Word *kw;
kw = _e_kbd_dict_changed_write_find(kd, word);
if (kw)
kw->usage = -1;
else
{
if (_e_kbd_dict_find_full(kd, word))
_e_kbd_dict_changed_write_add(kd, word, -1);
}
}
EAPI void
e_kbd_dict_word_letter_clear(E_Kbd_Dict *kd)
{
// clear the current word buffer
while (kd->word.letters)
e_kbd_dict_word_letter_delete(kd);
if (kd->matches.deadends)
{
eina_hash_free(kd->matches.deadends);
kd->matches.deadends = NULL;
}
if (kd->matches.leads)
{
eina_hash_free(kd->matches.leads);
kd->matches.leads = NULL;
}
while (kd->matches.list)
{
E_Kbd_Dict_Word *kw;
kw = kd->matches.list->data;
eina_stringshare_del(kw->word);
free(kw);
kd->matches.list = eina_list_remove_list(kd->matches.list, kd->matches.list);
}
}
EAPI void
e_kbd_dict_word_letter_add(E_Kbd_Dict *kd, const char *letter, int dist)
{
// add a letter with a distance (0 == closest) as an option for the current
// letter position - advance starts a new letter position
Eina_List *l, *list;
E_Kbd_Dict_Letter *kl;
l = eina_list_last(kd->word.letters);
if (!l) return;
list = l->data;
kl = E_NEW(E_Kbd_Dict_Letter, 1);
if (!kl) return;
kl->letter = eina_stringshare_add(letter);
kl->dist = dist;
list = eina_list_append(list, kl);
l->data = list;
}
EAPI void
e_kbd_dict_word_letter_advance(E_Kbd_Dict *kd)
{
// start a new letter in the word
kd->word.letters = eina_list_append(kd->word.letters, NULL);
}
EAPI void
e_kbd_dict_word_letter_delete(E_Kbd_Dict *kd)
{
// delete the current letter completely
Eina_List *l, *list;
l = eina_list_last(kd->word.letters);
if (!l) return;
list = l->data;
while (list)
{
E_Kbd_Dict_Letter *kl;
kl = list->data;
eina_stringshare_del(kl->letter);
free(kl);
list = eina_list_remove_list(list, list);
}
kd->word.letters = eina_list_remove_list(kd->word.letters, l);
}
static void
_e_kbd_dict_matches_lookup_do(E_Kbd_Dict *kd, Eina_List *letters, char *buf, char *bufp, int maxdist, int wordlen, int distance, int *searched, int *found)
{
Eina_List *l;
E_Kbd_Dict_Letter *kl;
const char *p;
char *wd;
E_Kbd_Dict_Word *kw;
int usage = 0, len, d;
if (letters)
{
for (l = letters->data; l; l = l->next)
{
kl = l->data;
len = strlen(kl->letter);
strncpy(bufp, kl->letter, len);
bufp[len] = 0;
if (_e_kbd_dict_find(kd, buf))
{
d = kl->dist;
_e_kbd_dict_matches_lookup_do(kd, letters->next,
buf, bufp + len, maxdist,
wordlen,
distance + (d * d * d),
searched, found);
}
}
return;
}
(*searched)++;
p = _e_kbd_dict_find_full(kd, buf);
if (!p) return;
wd = _e_kbd_dict_line_parse(kd, p, &usage);
if (wd)
{
if (_e_kbd_dict_normalized_strcmp(wd, buf))
{
free(wd);
return;
}
kw = E_NEW(E_Kbd_Dict_Word, 1);
if (kw)
{
int w, b, w2, b2, wc, bc;
// match any capitalisation
for (w = 0, b = 0; wd[w] && buf[b];)
{
b2 = evas_string_char_next_get(buf, b, &bc);
w2 = evas_string_char_next_get(wd, w, &wc);
if (isupper(bc)) wd[w] = toupper(wc);
w = w2;
b = b2;
}
kw->word = eina_stringshare_add(wd);
// FIXME: magic combination of distance metric and
// frequency of usage. this is simple now, but could
// be tweaked
// basically a metric to see how far away the keys that
// were actually pressed are away from the letters of
// this word in a physical on-screen sense
kw->accuracy = (maxdist - distance) / wordlen;
// usage is the frequency of usage in the dictionary.
// it its < 1 time, it's assumed to be 1.
if (usage < 1) usage = 1;
// multiply usage by a factor of 100 for better detailed
// sorting. 10 == 1/10th factor
kw->usage = 10 + (usage - 1);
kd->matches.list = eina_list_append(kd->matches.list, kw);
(*found)++;
}
free(wd);
}
}
EAPI void
e_kbd_dict_matches_lookup(E_Kbd_Dict *kd)
{
Eina_List *l, *ll;
E_Kbd_Dict_Word *kw;
E_Kbd_Dict_Letter *kl;
int searched = 0;
int found = 0;
int maxdist = 0, lettermaxdist, wordlen, d, d1, d2;
char *buf;
// find all matches and sort them
wordlen = eina_list_count(kd->word.letters);
buf = alloca((wordlen + 1) * 10);
while (kd->matches.list)
{
kw = kd->matches.list->data;
eina_stringshare_del(kw->word);
free(kw);
kd->matches.list = eina_list_remove_list(kd->matches.list, kd->matches.list);
}
for (l = kd->word.letters; l; l = l->next)
{
lettermaxdist = 0;
for (ll = l->data; ll; ll = ll->next)
{
kl = ll->data;
d = kl->dist;
lettermaxdist += d * d * d;
}
maxdist += lettermaxdist;
}
if (kd->word.letters)
_e_kbd_dict_matches_lookup_do(kd, kd->word.letters, buf, buf, maxdist,
wordlen, 0, &searched, &found);
d1 = 0x7fffffff;
d2 = 0;
for (l = kd->matches.list; l; l = l->next)
{
kw = l->data;
if (kw->accuracy < d1) d1 = kw->accuracy;
if (kw->accuracy > d2) d2 = kw->accuracy;
}
for (l = kd->matches.list; l; l = l->next)
{
int a;
long long tmp;
kw = l->data;
kw->accuracy -= d1;
a = (kw->accuracy + 10) / 10;
printf("-> %s: acc=%i, use=%i -> ", kw->word, a, kw->usage);
tmp = kw->usage + 100;
tmp *= (a * a);
tmp /= 10000;
tmp *= (a * a);
tmp /= 10000;
tmp *= (a * a);
tmp /= 10000;
tmp *= (a * a);
tmp /= 10000;
kw->usage = tmp / 10000;
printf("%i\n", kw->usage);
}
printf("====== searched: %i/%i\n", found, searched);
kd->matches.list = eina_list_sort(kd->matches.list,
eina_list_count(kd->matches.list),
_e_kbd_dict_matches_lookup_cb_sort);
}
EAPI void
e_kbd_dict_matches_first(E_Kbd_Dict *kd)
{
// jump to first match
kd->matches.list_ptr = kd->matches.list;
}
EAPI void
e_kbd_dict_matches_next(E_Kbd_Dict *kd)
{
// jump to next match
kd->matches.list_ptr = kd->matches.list_ptr->next;
}
EAPI const char *
e_kbd_dict_matches_match_get(E_Kbd_Dict *kd, int *pri_ret)
{
// return the word (string utf-8) for the current match
if (kd->matches.list_ptr)
{
E_Kbd_Dict_Word *kw;
kw = kd->matches.list_ptr->data;
if (kw)
{
*pri_ret = kw->usage;
return kw->word;
}
}
return NULL;
}