efl/src/static_libs/liblinebreak/linebreak.c

738 lines
22 KiB
C

/* vim: set tabstop=4 shiftwidth=4: */
/*
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute
* it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgement in the product
* documentation would be appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source
* distribution.
*
* The main reference is Unicode Standard Annex 14 (UAX #14):
* <URL:http://www.unicode.org/reports/tr14/>
*
* When this library was designed, this annex was at Revision 19, for
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
* This library has been updated according to Revision 24, for
* Unicode 5.2.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*/
/**
* @file linebreak.c
*
* Implementation of the line breaking algorithm as described in Unicode
* Standard Annex 14.
*
* @version 2.0, 2010/01/03
* @author Wu Yongwei
*/
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"
/**
* Size of the second-level index to the line breaking properties.
*/
#define LINEBREAK_INDEX_SIZE 40
/**
* Version number of the library.
*/
const int linebreak_version = LINEBREAK_VERSION;
/**
* Enumeration of break actions. They are used in the break action
* pair table below.
*/
enum BreakAction
{
DIR_BRK, /**< Direct break opportunity */
IND_BRK, /**< Indirect break opportunity */
CMI_BRK, /**< Indirect break opportunity for combining marks */
CMP_BRK, /**< Prohibited break for combining marks */
PRH_BRK /**< Prohibited break */
};
/**
* Break action pair table. This is a direct mapping of Table 2 of
* Unicode Standard Annex 14, Revision 24.
*/
static enum BreakAction baTable[LBP_JT][LBP_JT] = {
{ /* OP */
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
{ /* CL */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* CP */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* QU */
PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* GL */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* NS */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* EX */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* SY */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* IS */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* PR */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* PO */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* NU */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* AL */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* ID */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* IN */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* HY */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* BA */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* BB */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* B2 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* ZW */
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* CM */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* WJ */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* H2 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
{ /* H3 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
{ /* JL */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
{ /* JV */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
{ /* JT */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
};
/**
* Struct for the second-level index to the line breaking properties.
*/
struct LineBreakPropertiesIndex
{
utf32_t end; /**< End coding point */
struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
};
/**
* Second-level index to the line breaking properties.
*/
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
{ 0xFFFFFFFF, lb_prop_default }
};
/**
* Initializes the second-level index to the line breaking properties.
* If it is not called, the performance of #get_char_lb_class_lang (and
* thus the main functionality) can be pretty bad, especially for big
* code points like those of Chinese.
*/
void init_linebreak(void)
{
size_t i;
size_t iPropDefault;
size_t len;
size_t step;
len = 0;
while (lb_prop_default[len].prop != LBP_Undefined)
++len;
step = len / LINEBREAK_INDEX_SIZE;
iPropDefault = 0;
for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
{
lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
iPropDefault += step;
lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
}
lb_prop_index[--i].end = 0xFFFFFFFF;
}
/**
* Gets the language-specific line breaking properties.
*
* @param lang language of the text
* @return pointer to the language-specific line breaking
* properties array if found; \c NULL otherwise
*/
static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
{
struct LineBreakPropertiesLang *lbplIter;
if (lang != NULL)
{
for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
{
if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
{
return lbplIter->lbp;
}
}
}
return NULL;
}
/**
* Gets the line breaking class of a character from a line breaking
* properties array.
*
* @param ch character to check
* @param lbp pointer to the line breaking properties array
* @return the line breaking class if found; \c LBP_XX otherwise
*/
static enum LineBreakClass get_char_lb_class(
utf32_t ch,
struct LineBreakProperties *lbp)
{
while (lbp->prop != LBP_Undefined && ch >= lbp->start)
{
if (ch <= lbp->end)
return lbp->prop;
++lbp;
}
return LBP_XX;
}
/**
* Gets the line breaking class of a character from the default line
* breaking properties array.
*
* @param ch character to check
* @return the line breaking class if found; \c LBP_XX otherwise
*/
static enum LineBreakClass get_char_lb_class_default(
utf32_t ch)
{
size_t i = 0;
while (ch > lb_prop_index[i].end)
++i;
assert(i < LINEBREAK_INDEX_SIZE);
return get_char_lb_class(ch, lb_prop_index[i].lbp);
}
/**
* Gets the line breaking class of a character for a specific
* language. This function will check the language-specific data first,
* and then the default data if there is no language-specific property
* available for the character.
*
* @param ch character to check
* @param lbpLang pointer to the language-specific line breaking
* properties array
* @return the line breaking class if found; \c LBP_XX
* otherwise
*/
static enum LineBreakClass get_char_lb_class_lang(
utf32_t ch,
struct LineBreakProperties *lbpLang)
{
enum LineBreakClass lbcResult;
/* Find the language-specific line breaking class for a character */
if (lbpLang)
{
lbcResult = get_char_lb_class(ch, lbpLang);
if (lbcResult != LBP_XX)
return lbcResult;
}
/* Find the generic language-specific line breaking class, if no
* language context is provided, or language-specific data are not
* available for the specific character in the specified language */
return get_char_lb_class_default(ch);
}
/**
* Resolves the line breaking class for certain ambiguous or complicated
* characters. They are treated in a simplistic way in this
* implementation.
*
* @param lbc line breaking class to resolve
* @param lang language of the text
* @return the resolved line breaking class
*/
static enum LineBreakClass resolve_lb_class(
enum LineBreakClass lbc,
const char *lang)
{
switch (lbc)
{
case LBP_AI:
if (lang != NULL &&
(strncmp(lang, "zh", 2) == 0 || /* Chinese */
strncmp(lang, "ja", 2) == 0 || /* Japanese */
strncmp(lang, "ko", 2) == 0)) /* Korean */
{
return LBP_ID;
}
/* Fall through */
case LBP_SA:
case LBP_SG:
case LBP_XX:
return LBP_AL;
default:
return lbc;
}
}
/**
* Gets the next Unicode character in a UTF-8 sequence. The index will
* be advanced to the next complete character, unless the end of string
* is reached in the middle of a UTF-8 sequence.
*
* @param[in] s input UTF-8 string
* @param[in] len length of the string in bytes
* @param[in,out] ip pointer to the index
* @return the Unicode character beginning at the index; or
* #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf8(
const utf8_t *s,
size_t len,
size_t *ip)
{
utf8_t ch;
utf32_t res;
assert(*ip <= len);
if (*ip == len)
return EOS;
ch = s[*ip];
if (ch < 0xC2 || ch > 0xF4)
{ /* One-byte sequence, tail (should not occur), or invalid */
*ip += 1;
return ch;
}
else if (ch < 0xE0)
{ /* Two-byte sequence */
if (*ip + 2 > len)
return EOS;
res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
*ip += 2;
return res;
}
else if (ch < 0xF0)
{ /* Three-byte sequence */
if (*ip + 3 > len)
return EOS;
res = ((ch & 0x0F) << 12) +
((s[*ip + 1] & 0x3F) << 6) +
((s[*ip + 2] & 0x3F));
*ip += 3;
return res;
}
else
{ /* Four-byte sequence */
if (*ip + 4 > len)
return EOS;
res = ((ch & 0x07) << 18) +
((s[*ip + 1] & 0x3F) << 12) +
((s[*ip + 2] & 0x3F) << 6) +
((s[*ip + 3] & 0x3F));
*ip += 4;
return res;
}
}
/**
* Gets the next Unicode character in a UTF-16 sequence. The index will
* be advanced to the next complete character, unless the end of string
* is reached in the middle of a UTF-16 surrogate pair.
*
* @param[in] s input UTF-16 string
* @param[in] len length of the string in words
* @param[in,out] ip pointer to the index
* @return the Unicode character beginning at the index; or
* #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf16(
const utf16_t *s,
size_t len,
size_t *ip)
{
utf16_t ch;
assert(*ip <= len);
if (*ip == len)
return EOS;
ch = s[(*ip)++];
if (ch < 0xD800 || ch > 0xDBFF)
{ /* If the character is not a high surrogate */
return ch;
}
if (*ip == len)
{ /* If the input ends here (an error) */
--(*ip);
return EOS;
}
if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
{ /* If the next character is not the low surrogate (an error) */
return ch;
}
/* Return the constructed character and advance the index again */
return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
}
/**
* Gets the next Unicode character in a UTF-32 sequence. The index will
* be advanced to the next character.
*
* @param[in] s input UTF-32 string
* @param[in] len length of the string in dwords
* @param[in,out] ip pointer to the index
* @return the Unicode character beginning at the index; or
* #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf32(
const utf32_t *s,
size_t len,
size_t *ip)
{
assert(*ip <= len);
if (*ip == len)
return EOS;
return s[(*ip)++];
}
/**
* Sets the line breaking information for a generic input string.
*
* @param[in] s input string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data,
* containing #LINEBREAK_MUSTBREAK,
* #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
* or #LINEBREAK_INSIDEACHAR
* @param[in] get_next_char function to get the next UTF-32 character
*/
void set_linebreaks(
const void *s,
size_t len,
const char *lang,
char *brks,
get_next_char_t get_next_char)
{
utf32_t ch;
enum LineBreakClass lbcCur;
enum LineBreakClass lbcNew;
enum LineBreakClass lbcLast;
struct LineBreakProperties *lbpLang;
size_t posCur = 0;
size_t posLast = 0;
--posLast; /* To be ++'d later */
ch = get_next_char(s, len, &posCur);
if (ch == EOS)
return;
lbpLang = get_lb_prop_lang(lang);
lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
lbcNew = LBP_Undefined;
nextline:
/* Special treatment for the first character */
switch (lbcCur)
{
case LBP_LF:
case LBP_NL:
lbcCur = LBP_BK;
break;
case LBP_CB:
lbcCur = LBP_BA;
break;
case LBP_SP:
lbcCur = LBP_WJ;
break;
default:
break;
}
/* Process a line till an explicit break or end of string */
for (;;)
{
for (++posLast; posLast < posCur - 1; ++posLast)
{
brks[posLast] = LINEBREAK_INSIDEACHAR;
}
assert(posLast == posCur - 1);
lbcLast = lbcNew;
ch = get_next_char(s, len, &posCur);
if (ch == EOS)
break;
lbcNew = get_char_lb_class_lang(ch, lbpLang);
if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
{
brks[posLast] = LINEBREAK_MUSTBREAK;
lbcCur = resolve_lb_class(lbcNew, lang);
goto nextline;
}
switch (lbcNew)
{
case LBP_SP:
brks[posLast] = LINEBREAK_NOBREAK;
continue;
case LBP_BK:
case LBP_LF:
case LBP_NL:
brks[posLast] = LINEBREAK_NOBREAK;
lbcCur = LBP_BK;
continue;
case LBP_CR:
brks[posLast] = LINEBREAK_NOBREAK;
lbcCur = LBP_CR;
continue;
case LBP_CB:
brks[posLast] = LINEBREAK_ALLOWBREAK;
lbcCur = LBP_BA;
continue;
default:
break;
}
lbcNew = resolve_lb_class(lbcNew, lang);
assert(lbcCur <= LBP_JT);
assert(lbcNew <= LBP_JT);
switch (baTable[lbcCur - 1][lbcNew - 1])
{
case DIR_BRK:
brks[posLast] = LINEBREAK_ALLOWBREAK;
break;
case CMI_BRK:
case IND_BRK:
if (lbcLast == LBP_SP)
{
brks[posLast] = LINEBREAK_ALLOWBREAK;
}
else
{
brks[posLast] = LINEBREAK_NOBREAK;
}
break;
case CMP_BRK:
brks[posLast] = LINEBREAK_NOBREAK;
if (lbcLast != LBP_SP)
continue;
break;
case PRH_BRK:
brks[posLast] = LINEBREAK_NOBREAK;
break;
}
lbcCur = lbcNew;
}
assert(posLast == posCur - 1 && posCur <= len);
/* Break after the last character */
brks[posLast] = LINEBREAK_MUSTBREAK;
/* When the input contains incomplete sequences */
while (posCur < len)
{
brks[posCur++] = LINEBREAK_INSIDEACHAR;
}
}
/**
* Sets the line breaking information for a UTF-8 input string.
*
* @param[in] s input UTF-8 string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data, containing
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf8(
const utf8_t *s,
size_t len,
const char *lang,
char *brks)
{
set_linebreaks(s, len, lang, brks,
(get_next_char_t)lb_get_next_char_utf8);
}
/**
* Sets the line breaking information for a UTF-16 input string.
*
* @param[in] s input UTF-16 string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data, containing
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf16(
const utf16_t *s,
size_t len,
const char *lang,
char *brks)
{
set_linebreaks(s, len, lang, brks,
(get_next_char_t)lb_get_next_char_utf16);
}
/**
* Sets the line breaking information for a UTF-32 input string.
*
* @param[in] s input UTF-32 string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data, containing
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf32(
const utf32_t *s,
size_t len,
const char *lang,
char *brks)
{
set_linebreaks(s, len, lang, brks,
(get_next_char_t)lb_get_next_char_utf32);
}
/**
* Tells whether a line break can occur between two Unicode characters.
* This is a wrapper function to expose a simple interface. Generally
* speaking, it is better to use #set_linebreaks_utf32 instead, since
* complicated cases involving combining marks, spaces, etc. cannot be
* correctly processed.
*
* @param char1 the first Unicode character
* @param char2 the second Unicode character
* @param lang language of the input
* @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
int is_line_breakable(
utf32_t char1,
utf32_t char2,
const char* lang)
{
utf32_t s[2];
char brks[2];
s[0] = char1;
s[1] = char2;
set_linebreaks_utf32(s, 2, lang, brks);
return brks[0];
}