Synced libunibreak local copy with upstream.

This commit is contained in:
Tom Hacohen 2013-08-07 11:56:46 +01:00
parent 3cc3d0878a
commit 4185694ecb
3 changed files with 146 additions and 45 deletions

View File

@ -40,7 +40,7 @@
* Implementation of the word breaking algorithm as described in Unicode
* Standard Annex 29.
*
* @version 2.2, 2012/02/04
* @version 2.3, 2013/05/14
* @author Tom Hacohen
*/
@ -188,7 +188,7 @@ static void set_wordbreaks(
switch (wbcCur)
{
case WBP_CR:
case WBP_CR:
/* WB3b */
set_brks_to(s, brks, posLast, posCur, len,
WORDBREAK_BREAK, get_next_char);
@ -196,7 +196,7 @@ static void set_wordbreaks(
posLast = posCur;
break;
case WBP_LF:
case WBP_LF:
if (wbcSeqStart == WBP_CR) /* WB3 */
{
set_brks_to(s, brks, posLast, posCur, len,
@ -207,7 +207,7 @@ static void set_wordbreaks(
}
/* Fall off */
case WBP_Newline:
case WBP_Newline:
/* WB3a,3b */
set_brks_to(s, brks, posLast, posCur, len,
WORDBREAK_BREAK, get_next_char);
@ -215,8 +215,8 @@ static void set_wordbreaks(
posLast = posCur;
break;
case WBP_Extend:
case WBP_Format:
case WBP_Extend:
case WBP_Format:
/* WB4 - If not the first char/after a newline (WB3a,3b), skip
* this class, set it to be the same as the prev, and mark
* brks not to break before them. */
@ -235,7 +235,7 @@ static void set_wordbreaks(
}
break;
case WBP_Katakana:
case WBP_Katakana:
if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
{
@ -252,7 +252,7 @@ static void set_wordbreaks(
posLast = posCur;
break;
case WBP_ALetter:
case WBP_ALetter:
if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
(wbcLast == WBP_Numeric) || /* WB10 */
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
@ -270,7 +270,7 @@ static void set_wordbreaks(
posLast = posCur;
break;
case WBP_MidNumLet:
case WBP_MidNumLet:
if ((wbcLast == WBP_ALetter) || /* WB6,7 */
(wbcLast == WBP_Numeric)) /* WB11,12 */
{
@ -285,7 +285,7 @@ static void set_wordbreaks(
}
break;
case WBP_MidLetter:
case WBP_MidLetter:
if (wbcLast == WBP_ALetter) /* WB6,7 */
{
/* Go on */
@ -299,7 +299,7 @@ static void set_wordbreaks(
}
break;
case WBP_MidNum:
case WBP_MidNum:
if (wbcLast == WBP_Numeric) /* WB11,12 */
{
/* Go on */
@ -313,7 +313,7 @@ static void set_wordbreaks(
}
break;
case WBP_Numeric:
case WBP_Numeric:
if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
(wbcLast == WBP_ALetter) || /* WB9 */
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
@ -331,7 +331,7 @@ static void set_wordbreaks(
posLast = posCur;
break;
case WBP_ExtendNumLet:
case WBP_ExtendNumLet:
/* WB13a,13b */
if ((wbcSeqStart == wbcLast) &&
((wbcLast == WBP_ALetter) ||
@ -352,7 +352,18 @@ static void set_wordbreaks(
posLast = posCur;
break;
case WBP_Any:
case WBP_Regional:
/* WB13c */
if (wbcSeqStart == WBP_Regional)
{
set_brks_to(s, brks, posLast, posCur, len,
WORDBREAK_NOBREAK, get_next_char);
}
wbcSeqStart = wbcCur;
posLast = posCur;
break;
case WBP_Any:
/* Allow breaks and reset */
set_brks_to(s, brks, posLast, posCur, len,
WORDBREAK_BREAK, get_next_char);

View File

@ -1,6 +1,6 @@
/* The content of this file is generated from:
# WordBreakProperty-6.0.0.txt
# Date: 2010-08-19, 00:48:48 GMT [MD]
# WordBreakProperty-6.2.0.txt
# Date: 2012-08-13, 19:12:09 GMT [MD]
*/
#include "linebreak.h"
@ -69,7 +69,7 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x05F0, 0x05F2, WBP_ALetter},
{0x05F3, 0x05F3, WBP_ALetter},
{0x05F4, 0x05F4, WBP_MidLetter},
{0x0600, 0x0603, WBP_Format},
{0x0600, 0x0604, WBP_Format},
{0x060C, 0x060D, WBP_MidNum},
{0x0610, 0x061A, WBP_Extend},
{0x0620, 0x063F, WBP_ALetter},
@ -117,6 +117,9 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x0829, 0x082D, WBP_Extend},
{0x0840, 0x0858, WBP_ALetter},
{0x0859, 0x085B, WBP_Extend},
{0x08A0, 0x08A0, WBP_ALetter},
{0x08A2, 0x08AC, WBP_ALetter},
{0x08E4, 0x08FE, WBP_Extend},
{0x0900, 0x0902, WBP_Extend},
{0x0903, 0x0903, WBP_Extend},
{0x0904, 0x0939, WBP_ALetter},
@ -360,9 +363,11 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x109A, 0x109C, WBP_Extend},
{0x109D, 0x109D, WBP_Extend},
{0x10A0, 0x10C5, WBP_ALetter},
{0x10C7, 0x10C7, WBP_ALetter},
{0x10CD, 0x10CD, WBP_ALetter},
{0x10D0, 0x10FA, WBP_ALetter},
{0x10FC, 0x10FC, WBP_ALetter},
{0x1100, 0x1248, WBP_ALetter},
{0x10FD, 0x1248, WBP_ALetter},
{0x124A, 0x124D, WBP_ALetter},
{0x1250, 0x1256, WBP_ALetter},
{0x1258, 0x1258, WBP_ALetter},
@ -396,7 +401,7 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x1760, 0x176C, WBP_ALetter},
{0x176E, 0x1770, WBP_ALetter},
{0x1772, 0x1773, WBP_Extend},
{0x17B4, 0x17B5, WBP_Format},
{0x17B4, 0x17B5, WBP_Extend},
{0x17B6, 0x17B6, WBP_Extend},
{0x17B7, 0x17BD, WBP_Extend},
{0x17BE, 0x17C5, WBP_Extend},
@ -466,9 +471,11 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x1BA6, 0x1BA7, WBP_Extend},
{0x1BA8, 0x1BA9, WBP_Extend},
{0x1BAA, 0x1BAA, WBP_Extend},
{0x1BAB, 0x1BAB, WBP_Extend},
{0x1BAC, 0x1BAD, WBP_Extend},
{0x1BAE, 0x1BAF, WBP_ALetter},
{0x1BB0, 0x1BB9, WBP_Numeric},
{0x1BC0, 0x1BE5, WBP_ALetter},
{0x1BBA, 0x1BE5, WBP_ALetter},
{0x1BE6, 0x1BE6, WBP_Extend},
{0x1BE7, 0x1BE7, WBP_Extend},
{0x1BE8, 0x1BE9, WBP_Extend},
@ -494,10 +501,12 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x1CE9, 0x1CEC, WBP_ALetter},
{0x1CED, 0x1CED, WBP_Extend},
{0x1CEE, 0x1CF1, WBP_ALetter},
{0x1CF2, 0x1CF2, WBP_Extend},
{0x1CF2, 0x1CF3, WBP_Extend},
{0x1CF4, 0x1CF4, WBP_Extend},
{0x1CF5, 0x1CF6, WBP_ALetter},
{0x1D00, 0x1D2B, WBP_ALetter},
{0x1D2C, 0x1D61, WBP_ALetter},
{0x1D62, 0x1D77, WBP_ALetter},
{0x1D2C, 0x1D6A, WBP_ALetter},
{0x1D6B, 0x1D77, WBP_ALetter},
{0x1D78, 0x1D78, WBP_ALetter},
{0x1D79, 0x1D9A, WBP_ALetter},
{0x1D9B, 0x1DBF, WBP_ALetter},
@ -565,13 +574,16 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x24B6, 0x24E9, WBP_ALetter},
{0x2C00, 0x2C2E, WBP_ALetter},
{0x2C30, 0x2C5E, WBP_ALetter},
{0x2C60, 0x2C7C, WBP_ALetter},
{0x2C7D, 0x2C7D, WBP_ALetter},
{0x2C60, 0x2C7B, WBP_ALetter},
{0x2C7C, 0x2C7D, WBP_ALetter},
{0x2C7E, 0x2CE4, WBP_ALetter},
{0x2CEB, 0x2CEE, WBP_ALetter},
{0x2CEF, 0x2CF1, WBP_Extend},
{0x2CF2, 0x2CF3, WBP_ALetter},
{0x2D00, 0x2D25, WBP_ALetter},
{0x2D30, 0x2D65, WBP_ALetter},
{0x2D27, 0x2D27, WBP_ALetter},
{0x2D2D, 0x2D2D, WBP_ALetter},
{0x2D30, 0x2D67, WBP_ALetter},
{0x2D6F, 0x2D6F, WBP_ALetter},
{0x2D7F, 0x2D7F, WBP_Extend},
{0x2D80, 0x2D96, WBP_ALetter},
@ -586,7 +598,8 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x2DE0, 0x2DFF, WBP_Extend},
{0x2E2F, 0x2E2F, WBP_ALetter},
{0x3005, 0x3005, WBP_ALetter},
{0x302A, 0x302F, WBP_Extend},
{0x302A, 0x302D, WBP_Extend},
{0x302E, 0x302F, WBP_Extend},
{0x3031, 0x3035, WBP_Katakana},
{0x303B, 0x303B, WBP_ALetter},
{0x303C, 0x303C, WBP_ALetter},
@ -616,9 +629,10 @@ static struct WordBreakProperties wb_prop_default[] = {
{0xA66E, 0xA66E, WBP_ALetter},
{0xA66F, 0xA66F, WBP_Extend},
{0xA670, 0xA672, WBP_Extend},
{0xA67C, 0xA67D, WBP_Extend},
{0xA674, 0xA67D, WBP_Extend},
{0xA67F, 0xA67F, WBP_ALetter},
{0xA680, 0xA697, WBP_ALetter},
{0xA69F, 0xA69F, WBP_Extend},
{0xA6A0, 0xA6E5, WBP_ALetter},
{0xA6E6, 0xA6EF, WBP_ALetter},
{0xA6F0, 0xA6F1, WBP_Extend},
@ -628,8 +642,9 @@ static struct WordBreakProperties wb_prop_default[] = {
{0xA771, 0xA787, WBP_ALetter},
{0xA788, 0xA788, WBP_ALetter},
{0xA78B, 0xA78E, WBP_ALetter},
{0xA790, 0xA791, WBP_ALetter},
{0xA7A0, 0xA7A9, WBP_ALetter},
{0xA790, 0xA793, WBP_ALetter},
{0xA7A0, 0xA7AA, WBP_ALetter},
{0xA7F8, 0xA7F9, WBP_ALetter},
{0xA7FA, 0xA7FA, WBP_ALetter},
{0xA7FB, 0xA801, WBP_ALetter},
{0xA802, 0xA802, WBP_Extend},
@ -686,6 +701,14 @@ static struct WordBreakProperties wb_prop_default[] = {
{0xAAB7, 0xAAB8, WBP_Extend},
{0xAABE, 0xAABF, WBP_Extend},
{0xAAC1, 0xAAC1, WBP_Extend},
{0xAAE0, 0xAAEA, WBP_ALetter},
{0xAAEB, 0xAAEB, WBP_Extend},
{0xAAEC, 0xAAED, WBP_Extend},
{0xAAEE, 0xAAEF, WBP_Extend},
{0xAAF2, 0xAAF2, WBP_ALetter},
{0xAAF3, 0xAAF4, WBP_ALetter},
{0xAAF5, 0xAAF5, WBP_Extend},
{0xAAF6, 0xAAF6, WBP_Extend},
{0xAB01, 0xAB06, WBP_ALetter},
{0xAB09, 0xAB0E, WBP_ALetter},
{0xAB11, 0xAB16, WBP_ALetter},
@ -781,6 +804,8 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x1083F, 0x10855, WBP_ALetter},
{0x10900, 0x10915, WBP_ALetter},
{0x10920, 0x10939, WBP_ALetter},
{0x10980, 0x109B7, WBP_ALetter},
{0x109BE, 0x109BF, WBP_ALetter},
{0x10A00, 0x10A00, WBP_ALetter},
{0x10A01, 0x10A03, WBP_Extend},
{0x10A05, 0x10A06, WBP_Extend},
@ -809,10 +834,40 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x110B7, 0x110B8, WBP_Extend},
{0x110B9, 0x110BA, WBP_Extend},
{0x110BD, 0x110BD, WBP_Format},
{0x110D0, 0x110E8, WBP_ALetter},
{0x110F0, 0x110F9, WBP_Numeric},
{0x11100, 0x11102, WBP_Extend},
{0x11103, 0x11126, WBP_ALetter},
{0x11127, 0x1112B, WBP_Extend},
{0x1112C, 0x1112C, WBP_Extend},
{0x1112D, 0x11134, WBP_Extend},
{0x11136, 0x1113F, WBP_Numeric},
{0x11180, 0x11181, WBP_Extend},
{0x11182, 0x11182, WBP_Extend},
{0x11183, 0x111B2, WBP_ALetter},
{0x111B3, 0x111B5, WBP_Extend},
{0x111B6, 0x111BE, WBP_Extend},
{0x111BF, 0x111C0, WBP_Extend},
{0x111C1, 0x111C4, WBP_ALetter},
{0x111D0, 0x111D9, WBP_Numeric},
{0x11680, 0x116AA, WBP_ALetter},
{0x116AB, 0x116AB, WBP_Extend},
{0x116AC, 0x116AC, WBP_Extend},
{0x116AD, 0x116AD, WBP_Extend},
{0x116AE, 0x116AF, WBP_Extend},
{0x116B0, 0x116B5, WBP_Extend},
{0x116B6, 0x116B6, WBP_Extend},
{0x116B7, 0x116B7, WBP_Extend},
{0x116C0, 0x116C9, WBP_Numeric},
{0x12000, 0x1236E, WBP_ALetter},
{0x12400, 0x12462, WBP_ALetter},
{0x13000, 0x1342E, WBP_ALetter},
{0x16800, 0x16A38, WBP_ALetter},
{0x16F00, 0x16F44, WBP_ALetter},
{0x16F50, 0x16F50, WBP_ALetter},
{0x16F51, 0x16F7E, WBP_Extend},
{0x16F8F, 0x16F92, WBP_Extend},
{0x16F93, 0x16F9F, WBP_ALetter},
{0x1B000, 0x1B000, WBP_Katakana},
{0x1D165, 0x1D166, WBP_Extend},
{0x1D167, 0x1D169, WBP_Extend},
@ -853,6 +908,40 @@ static struct WordBreakProperties wb_prop_default[] = {
{0x1D7AA, 0x1D7C2, WBP_ALetter},
{0x1D7C4, 0x1D7CB, WBP_ALetter},
{0x1D7CE, 0x1D7FF, WBP_Numeric},
{0x1EE00, 0x1EE03, WBP_ALetter},
{0x1EE05, 0x1EE1F, WBP_ALetter},
{0x1EE21, 0x1EE22, WBP_ALetter},
{0x1EE24, 0x1EE24, WBP_ALetter},
{0x1EE27, 0x1EE27, WBP_ALetter},
{0x1EE29, 0x1EE32, WBP_ALetter},
{0x1EE34, 0x1EE37, WBP_ALetter},
{0x1EE39, 0x1EE39, WBP_ALetter},
{0x1EE3B, 0x1EE3B, WBP_ALetter},
{0x1EE42, 0x1EE42, WBP_ALetter},
{0x1EE47, 0x1EE47, WBP_ALetter},
{0x1EE49, 0x1EE49, WBP_ALetter},
{0x1EE4B, 0x1EE4B, WBP_ALetter},
{0x1EE4D, 0x1EE4F, WBP_ALetter},
{0x1EE51, 0x1EE52, WBP_ALetter},
{0x1EE54, 0x1EE54, WBP_ALetter},
{0x1EE57, 0x1EE57, WBP_ALetter},
{0x1EE59, 0x1EE59, WBP_ALetter},
{0x1EE5B, 0x1EE5B, WBP_ALetter},
{0x1EE5D, 0x1EE5D, WBP_ALetter},
{0x1EE5F, 0x1EE5F, WBP_ALetter},
{0x1EE61, 0x1EE62, WBP_ALetter},
{0x1EE64, 0x1EE64, WBP_ALetter},
{0x1EE67, 0x1EE6A, WBP_ALetter},
{0x1EE6C, 0x1EE72, WBP_ALetter},
{0x1EE74, 0x1EE77, WBP_ALetter},
{0x1EE79, 0x1EE7C, WBP_ALetter},
{0x1EE7E, 0x1EE7E, WBP_ALetter},
{0x1EE80, 0x1EE89, WBP_ALetter},
{0x1EE8B, 0x1EE9B, WBP_ALetter},
{0x1EEA1, 0x1EEA3, WBP_ALetter},
{0x1EEA5, 0x1EEA9, WBP_ALetter},
{0x1EEAB, 0x1EEBB, WBP_ALetter},
{0x1F1E6, 0x1F1FF, WBP_Regional},
{0xE0001, 0xE0001, WBP_Format},
{0xE0020, 0xE007F, WBP_Format},
{0xE0100, 0xE01EF, WBP_Extend},

View File

@ -40,7 +40,7 @@
* Definitions of internal data structures, declarations of global
* variables, and function prototypes for the word breaking algorithm.
*
* @version 2.1, 2012/01/18
* @version 2.2, 2013/05/14
* @author Tom Hacohen
*/
@ -50,20 +50,21 @@
*/
enum WordBreakClass
{
WBP_Undefined,
WBP_CR,
WBP_LF,
WBP_Newline,
WBP_Extend,
WBP_Format,
WBP_Katakana,
WBP_ALetter,
WBP_MidNumLet,
WBP_MidLetter,
WBP_MidNum,
WBP_Numeric,
WBP_ExtendNumLet,
WBP_Any
WBP_Undefined,
WBP_CR,
WBP_LF,
WBP_Newline,
WBP_Extend,
WBP_Format,
WBP_Katakana,
WBP_ALetter,
WBP_MidNumLet,
WBP_MidLetter,
WBP_MidNum,
WBP_Numeric,
WBP_ExtendNumLet,
WBP_Regional,
WBP_Any
};
/**