Evas textblock: fix wrong hyphenation with non-UTF-8 encoded dictionaries

Summary:
hnj_hyphen_hyphenate2() expects its input text to use the same encoding as
the given dictionary. Each dictionary declares its encoding at the head of
the file, so the text is now converted to that encoding before the function
is called. This fixes T3221.
@fix

Test Plan: Included in Evas test suite.

Reviewers: z-wony, tasn, woohyun, herdsman, Blackmole, minudf

Subscribers: zmike, stefan_schmidt, raster, cedric, jpeg

Tags: #efl

Maniphest Tasks: T3221

Differential Revision: https://phab.enlightenment.org/D3863
This commit is contained in:
Youngbok Shin 2018-05-06 10:24:58 +03:00 committed by Daniel Hirt
parent c33ef15d5d
commit 73c39bcf12
7 changed files with 18784 additions and 15 deletions

View File

@ -2524,7 +2524,9 @@ tests/evas/images/bg_gray.png \
tests/evas/images/flower.jp2 \
tests/evas/images/flower.jp2.png \
tests/evas/images/train.j2k \
tests/evas/images/train.j2k.png
tests/evas/images/train.j2k.png \
tests/evas/dicts/hyph_en_US.dic \
tests/evas/dicts/hyph_de_DE.dic
if HAVE_ELUA

View File

@ -26,14 +26,19 @@ _dicts_hyphen_init(Eo *eo_obj)
}
}
static void *
static HyphenDict *
_dict_hyphen_load(const char *lang)
{
Eina_Iterator *it;
Eina_Iterator *it = NULL;
Eina_File_Direct_Info *dir;
void *dict = NULL;
HyphenDict *dict = NULL;
const char *env_dir = getenv("EVAS_DICTS_HYPHEN_DIR");
if (env_dir && strlen(env_dir) > 0)
it = eina_file_direct_ls(env_dir);
if (!it) it = eina_file_direct_ls(EVAS_DICTS_HYPHEN_DIR);
it = eina_file_direct_ls(EVAS_DICTS_HYPHEN_DIR);
if (!it)
{
ERR("Couldn't list files in hyphens path: %s\n", EVAS_DICTS_HYPHEN_DIR);
@ -103,7 +108,7 @@ _dicts_hyphen_detach(Eo *eo_obj)
/* Returns the hyphen dictionary that matches the given language
* string. The string should be in the format xx_XX e.g. en_US */
static inline void *
static inline HyphenDict *
_hyphen_dict_get_from_lang(const char *lang)
{
if (!lang || !(*lang))
@ -127,13 +132,14 @@ static char *
_layout_wrap_hyphens_get(const Eina_Unicode *text, const char *lang,
int word_start, int word_len)
{
char *utf8;
int utf8_len; /* length of word */
char *hyphens;
char *hyphens = NULL;
char **rep = NULL;
int *pos = NULL;
int *cut = NULL;
void *dict;
HyphenDict *dict;
char *converted_text = NULL;
size_t converted_text_offset = 0;
size_t converted_len = 0;
dict = _hyphen_dict_get_from_lang(lang);
if (!dict)
@ -142,11 +148,44 @@ _layout_wrap_hyphens_get(const Eina_Unicode *text, const char *lang,
return NULL;
}
utf8 = eina_unicode_unicode_to_utf8_range(
text + word_start, word_len, &utf8_len);
hyphens = malloc(sizeof(char) * (word_len + 5));
hnj_hyphen_hyphenate2(dict, utf8, word_len, hyphens, NULL, &rep, &pos, &cut);
free(utf8);
/* Convert UTF-32 encoded text to the other encoding
* which is described in hyphen dictionary. */
if (dict->cset && strcmp(dict->cset, "UTF-32"))
{
converted_text = eina_str_convert_len("UTF-32", dict->cset,
(char *)(text + word_start),
word_len * sizeof(Eina_Unicode),
&converted_len);
if (!converted_text) goto hyphens_done;
/* Skip BOM character (0xFFFE) from converted text */
if ((converted_len >= 2) &&
(converted_text[0] == 0xff) &&
(converted_text[1] == 0xfe))
converted_text_offset = 2;
/* If there is only a BOM character, return NULL */
if (converted_len == converted_text_offset)
goto hyphens_done;
}
if (converted_text)
{
hyphens = malloc(sizeof(char) * (converted_len + 5));
hnj_hyphen_hyphenate2(dict, converted_text + converted_text_offset,
(int)(converted_len - converted_text_offset), hyphens, NULL, &rep, &pos, &cut);
}
else
{
hyphens = malloc(sizeof(char) * (word_len + 5));
hnj_hyphen_hyphenate2(dict, (char *)(text + word_start),
word_len, hyphens, NULL, &rep, &pos, &cut);
}
hyphens_done:
if (converted_text) free(converted_text);
return hyphens;
}

View File

@ -0,0 +1,30 @@
Hyphenation dictionary
----------------------
Language: German (de DE).
Origin: Based on the TeX hyphenation tables
http://www.tug.org/tex-archive/language/hyphenation/dehyphn.tex
License: GNU LGPL license.
Author: conversion author is Marco Huggenberger<marco@by-night.ch>
revised conversion and extensions: Daniel Naber
http://qa.openoffice.org/issues/show_bug.cgi?id=26355
Please note, this dictionary is based on syllable matching patterns
and thus should be suitable under other variations of German
HYPH de DE hyph_de_DE
HYPH de CH hyph_de_CH
--------------------------------------------------------------------------------
Trennmuster (hyph_de_DE.dic):
--------------------------------------------------------------------------------
Die Trennmuster (hyph_de_DE.dic) basieren auf den TeX Trennmustern
"dehyphn.tex", revision level 31.
Lizenz der Trennmuster: GNU LGPL. Die Anpassung der Trennmuster an
den in OpenOffice.org benutzten "ALTLinux LibHnj Hyphenator" wurde
mit dem Script substrings.pl durchgeführt, das unter
http://lingucomponent.openoffice.org/hyphenator.html als Teil der
Datei altlinux_Hyph.zip heruntergeladen werden kann.
Die Original-Trennmuster können hier heruntergeladen werden:
http://www.tug.org/tex-archive/language/hyphenation/dehyphn.tex

View File

@ -0,0 +1,53 @@
hyph_en_US.dic - American English hyphenation patterns for OpenOffice.org
version 2010-02-23
Changes
- set correct LEFTHYPHENMIN = 2, RIGHTHYPHENMIN = 3
- handle apostrophes (forbid *o'=clock etc.)
- set COMPOUNDLEFTHYPHENMIN, COMPOUNDRIGHTHYPHENMIN values
- UTF-8 encoding
- Unicode ligature support
License
BSD-style. Unlimited copying, redistribution and modification of this file
is permitted with this copyright and license information.
See original license in this file.
Conversion and modifications by László Németh (nemeth at OOo).
Based on the plain TeX hyphenation table
(http://tug.ctan.org/text-archive/macros/plain/base/hyphen.tex) and
the TugBoat hyphenation exceptions log in
http://www.ctan.org/tex-archive/info/digests/tugboat/tb0hyf.tex, processed
by the hyphenex.sh script (see in the same directory).
Originally developed and distributed with the Hyphen hyphenation library,
see http://hunspell.sourceforge.net/ for the source files and the conversion
scripts.
Licenses
hyphen.tex:
% The Plain TeX hyphenation tables [NOT TO BE CHANGED IN ANY WAY!]
% Unlimited copying and redistribution of this file are permitted as long
% as this file is not modified. Modifications are permitted, but only if
% the resulting file is not named hyphen.tex.
output of hyphenex.sh:
% Hyphenation exceptions for US English, based on hyphenation exception
% log articles in TUGboat.
%
% Copyright 2007 TeX Users Group.
% You may freely use, modify and/or distribute this file.
%
% This is an automatically generated file. Do not edit!
%
% Please contact the TUGboat editorial staff <tugboat@tug.org>
% for corrections and omissions.
hyph_en_US.txt:
See the previous licenses.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -16,6 +16,8 @@
#include "evas_suite.h"
#include "evas_tests_helpers.h"
#define TESTS_DIC_DIR TESTS_SRC_DIR"/dicts"
/* Functions defined in evas_object_textblock.c */
EAPI Eina_Bool
_evas_textblock_check_item_node_link(Evas_Object *obj);
@ -4150,6 +4152,21 @@ EFL_START_TEST(evas_textblock_hyphenation)
evas_object_textblock_text_markup_set(tb, buf);
_hyphenation_width_stress(tb, cur);
setenv("EVAS_DICTS_HYPHEN_DIR", TESTS_DIC_DIR, 1);
buf = "europäi-";
evas_object_textblock_text_markup_set(tb, buf);
evas_object_textblock_size_formatted_get(tb, &w, NULL);
buf = "europäischen";
evas_object_textblock_text_markup_set(tb, buf);
evas_textblock_cursor_format_prepend(cur, "<wrap=hyphenation lang=de_DE>");
evas_object_resize(tb, w, 100);
evas_object_textblock_size_formatted_get(tb, &fw, NULL);
ck_assert_int_eq(w, fw);
unsetenv("EVAS_DICTS_HYPHEN_DIR");
END_TB_TEST();
}
EFL_END_TEST;