summary | refs | log | tree | commit | diff
path: root/src/static_libs/libunibreak/wordbreak.c
diff options
context:
space:
mode:
author: Tom Hacohen <tom@stosb.com>, 2015-05-07 10:53:11 +0100
committer: Tom Hacohen <tom@stosb.com>, 2015-05-07 10:54:26 +0100
commit: 7a49d23f90f41c478db9b7beb9763aa0cd74ae46 (patch)
tree: 8ff009d70d91523d2c5f718ed395c8e23df43c4f /src/static_libs/libunibreak/wordbreak.c
parent: ba77a837a37af0d154d7ceafbb5ab7d4f75090f6 (diff)
Static deps unibreak: update to what will soon be version 3.
Version 3 is not yet released, but this is on track to become it. This is based on commit: a815e11f7ebf35b59278f783227a829ee4692760. @feature.
Diffstat (limited to 'src/static_libs/libunibreak/wordbreak.c')
-rw-r--r-- src/static_libs/libunibreak/wordbreak.c | 76
1 files changed, 58 insertions, 18 deletions
diff --git a/src/static_libs/libunibreak/wordbreak.c b/src/static_libs/libunibreak/wordbreak.c
index 5c1e3d0e79..d7d5a42b02 100644
--- a/src/static_libs/libunibreak/wordbreak.c
+++ b/src/static_libs/libunibreak/wordbreak.c
@@ -4,7 +4,7 @@
4 * Word breaking in a Unicode sequence. Designed to be used in a 4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer. 5 * generic text renderer.
6 * 6 *
7 * Copyright (C) 2013 Tom Hacohen <tom at stosb dot com> 7 * Copyright (C) 2013-2015 Tom Hacohen <tom at stosb dot com>
8 * 8 *
9 * This software is provided 'as-is', without any express or implied 9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages 10 * warranty. In no event will the author be held liable for any damages
@@ -30,9 +30,9 @@
30 * Unicode 6.0.0: 30 * Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html> 31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32 * 32 *
33 * This library has been updated according to Revision 21, for 33 * This library has been updated according to Revision 25, for
34 * Unicode 6.2.0: 34 * Unicode 7.0.0:
35 * <URL:http://www.unicode.org/reports/tr29/tr29-21.html> 35 * <URL:http://www.unicode.org/reports/tr29/tr29-25.html>
36 * 36 *
37 * The Unicode Terms of Use are available at 37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html> 38 * <URL:http://www.unicode.org/copyright.html>
@@ -44,16 +44,14 @@
44 * Implementation of the word breaking algorithm as described in Unicode 44 * Implementation of the word breaking algorithm as described in Unicode
45 * Standard Annex 29. 45 * Standard Annex 29.
46 * 46 *
47 * @version 2.4, 2013/09/28 47 * @version 2.6, 2015/04/18
48 * @author Tom Hacohen 48 * @author Tom Hacohen
49 */ 49 */
50 50
51#include <assert.h> 51#include <assert.h>
52#include <stddef.h> 52#include <stddef.h>
53#include <string.h> 53#include <string.h>
54#include "linebreak.h" 54#include "unibreakdef.h"
55#include "linebreakdef.h"
56
57#include "wordbreak.h" 55#include "wordbreak.h"
58#include "wordbreakdata.c" 56#include "wordbreakdata.c"
59 57
@@ -128,7 +126,6 @@ static void set_brks_to(
128 while (posNext < posEnd) 126 while (posNext < posEnd)
129 { 127 {
130 utf32_t ch; 128 utf32_t ch;
131 (void)ch;
132 ch = get_next_char(s, len, &posNext); 129 ch = get_next_char(s, len, &posNext);
133 assert(ch != EOS); 130 assert(ch != EOS);
134 for (; posStart < posNext - 1; ++posStart) 131 for (; posStart < posNext - 1; ++posStart)
@@ -257,8 +254,24 @@ static void set_wordbreaks(
257 posLast = posCur; 254 posLast = posCur;
258 break; 255 break;
259 256
257 case WBP_Hebrew_Letter:
260 case WBP_ALetter: 258 case WBP_ALetter:
261 if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */ 259 if ((wbcSeqStart == WBP_Hebrew_Letter) &&
260 (wbcLast == WBP_Double_Quote)) /* WB7b,c */
261 {
262 if (wbcCur == WBP_Hebrew_Letter)
263 {
264 set_brks_to(s, brks, posLast, posCur, len,
265 WORDBREAK_NOBREAK, get_next_char);
266 }
267 else
268 {
269 set_brks_to(s, brks, posLast, posCur, len,
270 WORDBREAK_BREAK, get_next_char);
271 }
272 }
273 else if (((wbcSeqStart == WBP_ALetter) ||
274 (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
262 (wbcLast == WBP_Numeric) || /* WB10 */ 275 (wbcLast == WBP_Numeric) || /* WB10 */
263 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ 276 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
264 { 277 {
@@ -275,8 +288,18 @@ static void set_wordbreaks(
275 posLast = posCur; 288 posLast = posCur;
276 break; 289 break;
277 290
291 case WBP_Single_Quote:
292 if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
293 {
294 set_brks_to(s, brks, posLast, posCur, len,
295 WORDBREAK_NOBREAK, get_next_char);
296 wbcSeqStart = wbcCur;
297 posLast = posCur;
298 }
299 /* No break on purpose */
278 case WBP_MidNumLet: 300 case WBP_MidNumLet:
279 if ((wbcLast == WBP_ALetter) || /* WB6,7 */ 301 if (((wbcLast == WBP_ALetter) ||
302 (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
280 (wbcLast == WBP_Numeric)) /* WB11,12 */ 303 (wbcLast == WBP_Numeric)) /* WB11,12 */
281 { 304 {
282 /* Go on */ 305 /* Go on */
@@ -291,7 +314,8 @@ static void set_wordbreaks(
291 break; 314 break;
292 315
293 case WBP_MidLetter: 316 case WBP_MidLetter:
294 if (wbcLast == WBP_ALetter) /* WB6,7 */ 317 if ((wbcLast == WBP_ALetter) ||
318 (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
295 { 319 {
296 /* Go on */ 320 /* Go on */
297 } 321 }
@@ -320,7 +344,8 @@ static void set_wordbreaks(
320 344
321 case WBP_Numeric: 345 case WBP_Numeric:
322 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */ 346 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
323 (wbcLast == WBP_ALetter) || /* WB9 */ 347 ((wbcLast == WBP_ALetter) ||
348 (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
324 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ 349 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
325 { 350 {
326 set_brks_to(s, brks, posLast, posCur, len, 351 set_brks_to(s, brks, posLast, posCur, len,
@@ -340,6 +365,7 @@ static void set_wordbreaks(
340 /* WB13a,13b */ 365 /* WB13a,13b */
341 if ((wbcSeqStart == wbcLast) && 366 if ((wbcSeqStart == wbcLast) &&
342 ((wbcLast == WBP_ALetter) || 367 ((wbcLast == WBP_ALetter) ||
368 (wbcLast == WBP_Hebrew_Letter) ||
343 (wbcLast == WBP_Numeric) || 369 (wbcLast == WBP_Numeric) ||
344 (wbcLast == WBP_Katakana) || 370 (wbcLast == WBP_Katakana) ||
345 (wbcLast == WBP_ExtendNumLet))) 371 (wbcLast == WBP_ExtendNumLet)))
@@ -357,9 +383,9 @@ static void set_wordbreaks(
357 posLast = posCur; 383 posLast = posCur;
358 break; 384 break;
359 385
360 case WBP_Regional: 386 case WBP_Regional_Indicator:
361 /* WB13c */ 387 /* WB13c */
362 if (wbcSeqStart == WBP_Regional) 388 if (wbcSeqStart == WBP_Regional_Indicator)
363 { 389 {
364 set_brks_to(s, brks, posLast, posCur, len, 390 set_brks_to(s, brks, posLast, posCur, len,
365 WORDBREAK_NOBREAK, get_next_char); 391 WORDBREAK_NOBREAK, get_next_char);
@@ -368,6 +394,20 @@ static void set_wordbreaks(
368 posLast = posCur; 394 posLast = posCur;
369 break; 395 break;
370 396
397 case WBP_Double_Quote:
398 if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
399 {
400 /* Go on */
401 }
402 else
403 {
404 set_brks_to(s, brks, posLast, posCur, len,
405 WORDBREAK_BREAK, get_next_char);
406 wbcSeqStart = wbcCur;
407 posLast = posCur;
408 }
409 break;
410
371 case WBP_Any: 411 case WBP_Any:
372 /* Allow breaks and reset */ 412 /* Allow breaks and reset */
373 set_brks_to(s, brks, posLast, posCur, len, 413 set_brks_to(s, brks, posLast, posCur, len,
@@ -409,7 +449,7 @@ void set_wordbreaks_utf8(
409 char *brks) 449 char *brks)
410{ 450{
411 set_wordbreaks(s, len, lang, brks, 451 set_wordbreaks(s, len, lang, brks,
412 (get_next_char_t)lb_get_next_char_utf8); 452 (get_next_char_t)ub_get_next_char_utf8);
413} 453}
414 454
415/** 455/**
@@ -429,7 +469,7 @@ void set_wordbreaks_utf16(
429 char *brks) 469 char *brks)
430{ 470{
431 set_wordbreaks(s, len, lang, brks, 471 set_wordbreaks(s, len, lang, brks,
432 (get_next_char_t)lb_get_next_char_utf16); 472 (get_next_char_t)ub_get_next_char_utf16);
433} 473}
434 474
435/** 475/**
@@ -449,5 +489,5 @@ void set_wordbreaks_utf32(
449 char *brks) 489 char *brks)
450{ 490{
451 set_wordbreaks(s, len, lang, brks, 491 set_wordbreaks(s, len, lang, brks,
452 (get_next_char_t)lb_get_next_char_utf32); 492 (get_next_char_t)ub_get_next_char_utf32);
453} 493}