/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * * Copyright (C) 2013-2016 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute * it freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must * not claim that you wrote the original software. If you use this * software in a product, an acknowledgement in the product * documentation would be appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must * not be misrepresented as being the original software. * 3. This notice may not be removed or altered from any source * distribution. * * The main reference is Unicode Standard Annex 29 (UAX #29): * * * When this library was designed, this annex was at Revision 17, for * Unicode 6.0.0: * * * This library has been updated according to Revision 29, for * Unicode 9.0.0: * * * The Unicode Terms of Use are available at * */ /** * @file wordbreak.c * * Implementation of the word breaking algorithm as described in Unicode * Standard Annex 29. * * @author Tom Hacohen */ #include #include #include #include "unibreakdef.h" #include "wordbreak.h" #include "wordbreakdata.c" #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) /** * Initializes the wordbreak internals. It currently does nothing, but * it may in the future. */ void init_wordbreak(void) { } /** * Gets the word breaking class of a character. 
* * @param ch character to check * @param wbp pointer to the wbp breaking properties array * @param len size of the wbp array in number of items * @return the word breaking class if found; \c WBP_Any otherwise */ static enum WordBreakClass get_char_wb_class( utf32_t ch, const struct WordBreakProperties *wbp, size_t len) { int min = 0; int max = len - 1; int mid; do { mid = (min + max) / 2; if (ch < wbp[mid].start) max = mid - 1; else if (ch > wbp[mid].end) min = mid + 1; else return wbp[mid].prop; } while (min <= max); return WBP_Any; } /** * Sets the word break types to a specific value in a range. * * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType. * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are * cells that we really don't want to break after. * * @param[in] s input string * @param[out] brks breaks array to fill * @param[in] posStart start position * @param[in] posEnd end position (exclusive) * @param[in] len length of the string * @param[in] brkType breaks type to use * @param[in] get_next_char function to get the next UTF-32 character */ static void set_brks_to( const void *s, char *brks, size_t posStart, size_t posEnd, size_t len, char brkType, get_next_char_t get_next_char) { size_t posNext = posStart; while (posNext < posEnd) { utf32_t ch; ch = get_next_char(s, len, &posNext); (void)ch; assert(ch != EOS); for (; posStart < posNext - 1; ++posStart) brks[posStart] = WORDBREAK_INSIDEACHAR; assert(posStart == posNext - 1); /* Only set it if we haven't set it not to break before. */ if (brks[posStart] != WORDBREAK_NOBREAK) brks[posStart] = brkType; posStart = posNext; } } /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */ #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \ (cls == WBP_LF)) /** * Sets the word breaking information for a generic input string. 
*
 * @param[in] s             input string
 * @param[in] len           length of the input
 * @param[in] lang          language of the input (reserved for future use)
 * @param[out] brks         pointer to the output breaking data, containing
 *                          #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                          #WORDBREAK_INSIDEACHAR
 * @param[in] get_next_char function to get the next UTF-32 character
 */
static void set_wordbreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    /*
     * Overview: the input is decoded one character at a time and each
     * character is classified with get_char_wb_class().  Decisions are
     * deferred: the positions in [posLast, posCur) stay pending until the
     * current class either resolves them through set_brks_to() or extends
     * the pending run (the "Go on" branches, which leave posLast and
     * wbcSeqStart untouched).
     */

    /* Counter of how many times we came across RI: set to 1 after an RI
     * that starts a (potential) pair, back to 0 once a pair was joined. */
    int riCounter = 0;
    /* Class recorded for the previously processed character (possibly an
     * inherited class, see the WB4 handling below). */
    enum WordBreakClass wbcLast = WBP_Undefined;
    /* wbcSeqStart is the class that started the current sequence.
     * WBP_Undefined is a special case that means "sot".
     * This value is the class that is at the start of the current rule
     * matching sequence. For example, in case of Numeric+MidNum+Numeric
     * it'll be Numeric all the way. */
    enum WordBreakClass wbcSeqStart = WBP_Undefined;
    utf32_t ch;
    /* posNext: read cursor advanced by get_next_char().
     * posCur:  start position of the character being classified.
     * posLast: start position of the pending (not yet resolved) run. */
    size_t posNext = 0;
    size_t posCur = 0;
    size_t posLast = 0;

    /* TODO: Language-specific specialization. */
    (void) lang;

    /* Init brks: every one of the len cells starts as "break allowed". */
    memset(brks, WORDBREAK_BREAK, len);

    ch = get_next_char(s, len, &posNext);

    while (ch != EOS)
    {
        enum WordBreakClass wbcCur;
        wbcCur = get_char_wb_class(ch, wb_prop_default,
                                   ARRAY_LEN(wb_prop_default));

        switch (wbcCur)
        {
        case WBP_CR:
            /* WB3b */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_LF:
            if (wbcSeqStart == WBP_CR) /* WB3 */
            {
                /* CR LF: keep the pair together. */
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
                break;
            }
            /* An LF that does not follow a CR is handled exactly like
             * WBP_Newline below. */
#ifndef __has_attribute
# define __has_attribute(x) 0
#endif
#if __has_attribute(fallthrough)
            __attribute__((fallthrough));
#endif
            /* Fall off */
        case WBP_Newline:
            /* WB3a,3b */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_E_Base_GAZ:
        case WBP_Glue_After_Zwj:
            /* WB3c */
            if (wbcLast == WBP_ZWJ)
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_ZWJ:
        case WBP_Extend:
        case WBP_Format:
            /* WB4 - If not the first char/after a newline (WB3a,3b), skip
             * this class, set it to be the same as the prev, and mark
             * brks not to break before them. */
            if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            else
            {
                /* It's surely not the first, so posCur - 1 is a valid
                 * index: the position right before this character. */
                brks[posCur - 1] = WORDBREAK_NOBREAK;
                /* WB3c precedes 4, so no intervening Extend chars allowed. */
                /* NOTE(review): because of the inheritance below, wbcLast
                 * can only equal WBP_ZWJ when the sequence itself started
                 * with ZWJ, so the WB3c test above only fires in that
                 * situation - confirm this is the intended behaviour. */
                if (wbcSeqStart != WBP_ZWJ)
                {
                    /* "inherit" the previous class. */
                    wbcCur = wbcLast;
                }
            }
            break;

        case WBP_Katakana:
            if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Hebrew_Letter:
        case WBP_ALetter:
            if ((wbcSeqStart == WBP_Hebrew_Letter) &&
                    (wbcLast == WBP_Double_Quote)) /* WB7b,c */
            {
                /* Hebrew_Letter Double_Quote was left pending; it is only
                 * joined when another Hebrew_Letter follows. */
                if (wbcCur == WBP_Hebrew_Letter)
                {
                    set_brks_to(s, brks, posLast, posCur, len,
                                WORDBREAK_NOBREAK, get_next_char);
                }
                else
                {
                    set_brks_to(s, brks, posLast, posCur, len,
                                WORDBREAK_BREAK, get_next_char);
                }
            }
            else if (((wbcSeqStart == WBP_ALetter) ||
                      (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
                     (wbcLast == WBP_Numeric) || /* WB10 */
                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Single_Quote:
            if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            /* Control continues into the MidNumLet handling in both cases;
             * after the WB7a branch wbcLast is Hebrew_Letter, so that code
             * takes its "Go on" path. The #ifndef below is a no-op when
             * __has_attribute is already defined. */
#ifndef __has_attribute
# define __has_attribute(x) 0
#endif
#if __has_attribute(fallthrough)
            __attribute__((fallthrough));
#endif
            /* No break on purpose */
        case WBP_MidNumLet:
            if (((wbcLast == WBP_ALetter) ||
                    (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
                    (wbcLast == WBP_Numeric)) /* WB11,12 */
            {
                /* Go on: the decision is deferred to the next character. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_MidLetter:
            if ((wbcLast == WBP_ALetter) ||
                    (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
            {
                /* Go on: the decision is deferred to the next character. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_MidNum:
            if (wbcLast == WBP_Numeric) /* WB11,12 */
            {
                /* Go on: the decision is deferred to the next character. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_Numeric:
            if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
                    ((wbcLast == WBP_ALetter) ||
                     (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_ExtendNumLet:
            /* WB13a,13b */
            /* Joined only when the previous class is also the class that
             * started the sequence and is one of the listed classes. */
            if ((wbcSeqStart == wbcLast) &&
                ((wbcLast == WBP_ALetter) ||
                 (wbcLast == WBP_Hebrew_Letter) ||
                 (wbcLast == WBP_Numeric) ||
                 (wbcLast == WBP_Katakana) ||
                 (wbcLast == WBP_ExtendNumLet)))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_E_Base:
            /* No rule found, reset */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_E_Modifier:
            /* WB14 */
            if ((wbcLast == WBP_E_Base) ||
                (wbcLast == WBP_E_Base_GAZ))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Regional_Indicator:
            /* WB15,16 */
            /* Join this RI to the previous one only when that one is the
             * unpaired first half of a pair (riCounter is odd). */
            if ((wbcSeqStart == WBP_Regional_Indicator) &&
                ((riCounter % 2) == 1))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                riCounter = 0; /* Reset the sequence */
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                riCounter = 1;
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Double_Quote:
            if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
            {
                /* Go on: resolved by the Hebrew_Letter/ALetter case once
                 * the next character is known. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_Any:
            /* Allow breaks and reset */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        default:
            /* Error, should never get here! */
            assert(0);
            break;
        }

        /* Remember the (possibly inherited) class and move on to the next
         * character. */
        wbcLast = wbcCur;
        posCur = posNext;
        ch = get_next_char(s, len, &posNext);
    }

    /* WB2: flush whatever is still pending, up to where decoding stopped. */
    set_brks_to(s, brks, posLast, posNext, len,
                WORDBREAK_BREAK, get_next_char);
}

/**
 * Sets the word breaking information for a UTF-8 input string.
 *
 * Thin wrapper: forwards to set_wordbreaks() with ub_get_next_char_utf8
 * as the decoding function.  set_wordbreaks() initializes \a len cells of
 * \a brks, so the array must provide at least that many.
 *
 * @param[in] s     input UTF-8 string
 * @param[in] len   length of the input
 * @param[in] lang  language of the input (reserved for future use)
 * @param[out] brks pointer to the output breaking data, containing
 *                  #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                  #WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf8(
        const utf8_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    set_wordbreaks(s, len, lang, brks,
                   (get_next_char_t)ub_get_next_char_utf8);
}

/**
 * Sets the word breaking information for a UTF-16 input string.
 *
 * Thin wrapper: forwards to set_wordbreaks() with ub_get_next_char_utf16
 * as the decoding function.  set_wordbreaks() initializes \a len cells of
 * \a brks, so the array must provide at least that many.
 *
 * @param[in] s     input UTF-16 string
 * @param[in] len   length of the input
 * @param[in] lang  language of the input (reserved for future use)
 * @param[out] brks pointer to the output breaking data, containing
 *                  #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                  #WORDBREAK_INSIDEACHAR
 */
void set_wordbreaks_utf16(
        const utf16_t *s,
        size_t len,
        const char *lang,
        char *brks)
{
    set_wordbreaks(s, len, lang, brks,
                   (get_next_char_t)ub_get_next_char_utf16);
}

/**
 * Sets the word breaking information for a UTF-32 input string.
* * @param[in] s input UTF-32 string * @param[in] len length of the input * @param[in] lang language of the input (reserved for future use) * @param[out] brks pointer to the output breaking data, containing * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf32( const utf32_t *s, size_t len, const char *lang, char *brks) { set_wordbreaks(s, len, lang, brks, (get_next_char_t)ub_get_next_char_utf32); }