summary | refs | log | tree | commit | diff
path: root/src/static_libs/libunibreak/linebreak.c
diff options
context:
space:
mode:
author: Tom Hacohen <tom@stosb.com> 2015-05-07 10:53:11 +0100
committer: Tom Hacohen <tom@stosb.com> 2015-05-07 10:54:26 +0100
commit: 7a49d23f90f41c478db9b7beb9763aa0cd74ae46 (patch)
tree: 8ff009d70d91523d2c5f718ed395c8e23df43c4f /src/static_libs/libunibreak/linebreak.c
parent: ba77a837a37af0d154d7ceafbb5ab7d4f75090f6 (diff)
Static deps unibreak: update to what will soon be version 3.
Version 3 is not yet released, but this is on track to become it. This is based on commit: a815e11f7ebf35b59278f783227a829ee4692760. @feature.
Diffstat (limited to 'src/static_libs/libunibreak/linebreak.c')
-rw-r--r-- src/static_libs/libunibreak/linebreak.c 179
1 files changed, 34 insertions, 145 deletions
diff --git a/src/static_libs/libunibreak/linebreak.c b/src/static_libs/libunibreak/linebreak.c
index 9716df4860..7c8ff9ed19 100644
--- a/src/static_libs/libunibreak/linebreak.c
+++ b/src/static_libs/libunibreak/linebreak.c
@@ -4,7 +4,7 @@
4 * Line breaking in a Unicode sequence. Designed to be used in a 4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer. 5 * generic text renderer.
6 * 6 *
7 * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com> 7 * Copyright (C) 2008-2015 Wu Yongwei <wuyongwei at gmail dot com>
8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com> 8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
9 * 9 *
10 * This software is provided 'as-is', without any express or implied 10 * This software is provided 'as-is', without any express or implied
@@ -31,9 +31,9 @@
31 * Unicode 5.0.0: 31 * Unicode 5.0.0:
32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html> 32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
33 * 33 *
34 * This library has been updated according to Revision 30, for 34 * This library has been updated according to Revision 33, for
35 * Unicode 6.2.0: 35 * Unicode 7.0.0:
36 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html> 36 * <URL:http://www.unicode.org/reports/tr14/tr14-33.html>
37 * 37 *
38 * The Unicode Terms of Use are available at 38 * The Unicode Terms of Use are available at
39 * <URL:http://www.unicode.org/copyright.html> 39 * <URL:http://www.unicode.org/copyright.html>
@@ -45,7 +45,7 @@
45 * Implementation of the line breaking algorithm as described in Unicode 45 * Implementation of the line breaking algorithm as described in Unicode
46 * Standard Annex 14. 46 * Standard Annex 14.
47 * 47 *
48 * @version 2.5, 2013/11/14 48 * @version 2.7, 2015/04/18
49 * @author Wu Yongwei 49 * @author Wu Yongwei
50 * @author Petr Filipsky 50 * @author Petr Filipsky
51 */ 51 */
@@ -67,11 +67,6 @@
67#define LINEBREAK_INDEX_SIZE 40 67#define LINEBREAK_INDEX_SIZE 40
68 68
69/** 69/**
70 * Version number of the library.
71 */
72const int linebreak_version = LINEBREAK_VERSION;
73
74/**
75 * Enumeration of break actions. They are used in the break action 70 * Enumeration of break actions. They are used in the break action
76 * pair table below. 71 * pair table below.
77 */ 72 */
@@ -451,7 +446,7 @@ static enum LineBreakClass resolve_lb_class(
451 * @post \a lbpCtx->lbcCur has the updated line break class 446 * @post \a lbpCtx->lbcCur has the updated line break class
452 */ 447 */
453static void treat_first_char( 448static void treat_first_char(
454 struct LineBreakContext* lbpCtx) 449 struct LineBreakContext *lbpCtx)
455{ 450{
456 switch (lbpCtx->lbcCur) 451 switch (lbpCtx->lbcCur)
457 { 452 {
@@ -465,6 +460,8 @@ static void treat_first_char(
465 case LBP_SP: 460 case LBP_SP:
466 lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */ 461 lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
467 break; 462 break;
463 case LBP_HL:
464 lbpCtx->fLb21aHebrew = 1; /* Rule LB21a */
468 default: 465 default:
469 break; 466 break;
470 } 467 }
@@ -485,7 +482,7 @@ static void treat_first_char(
485 * table lookup is needed 482 * table lookup is needed
486 */ 483 */
487static int get_lb_result_simple( 484static int get_lb_result_simple(
488 struct LineBreakContext* lbpCtx) 485 struct LineBreakContext *lbpCtx)
489{ 486{
490 if (lbpCtx->lbcCur == LBP_BK 487 if (lbpCtx->lbcCur == LBP_BK
491 || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF)) 488 || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
@@ -528,13 +525,12 @@ static int get_lb_result_simple(
528 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK 525 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
529 */ 526 */
530static int get_lb_result_lookup( 527static int get_lb_result_lookup(
531 struct LineBreakContext* lbpCtx) 528 struct LineBreakContext *lbpCtx)
532{ 529{
533 /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
534 * yet implemented below. */
535 int brk = LINEBREAK_UNDEFINED; 530 int brk = LINEBREAK_UNDEFINED;
536 assert(lbpCtx->lbcCur <= LBP_JT); 531
537 assert(lbpCtx->lbcNew <= LBP_JT); 532 assert(lbpCtx->lbcCur <= LBP_RI);
533 assert(lbpCtx->lbcNew <= LBP_RI);
538 switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1]) 534 switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
539 { 535 {
540 case DIR_BRK: 536 case DIR_BRK:
@@ -555,6 +551,19 @@ static int get_lb_result_lookup(
555 brk = LINEBREAK_NOBREAK; 551 brk = LINEBREAK_NOBREAK;
556 break; 552 break;
557 } 553 }
554
555 /* Special processing due to rule LB21a */
556 if (lbpCtx->fLb21aHebrew &&
557 (lbpCtx->lbcCur == LBP_HY || lbpCtx->lbcCur == LBP_BA))
558 {
559 brk = LINEBREAK_NOBREAK;
560 lbpCtx->fLb21aHebrew = 0;
561 }
562 else if (!(lbpCtx->lbcNew == LBP_HY || lbpCtx->lbcNew == LBP_BA))
563 {
564 lbpCtx->fLb21aHebrew = (lbpCtx->lbcNew == LBP_HL);
565 }
566
558 lbpCtx->lbcCur = lbpCtx->lbcNew; 567 lbpCtx->lbcCur = lbpCtx->lbcNew;
559 return brk; 568 return brk;
560} 569}
@@ -568,9 +577,9 @@ static int get_lb_result_lookup(
568 * @post the line breaking context is initialized 577 * @post the line breaking context is initialized
569 */ 578 */
570void lb_init_break_context( 579void lb_init_break_context(
571 struct LineBreakContext* lbpCtx, 580 struct LineBreakContext *lbpCtx,
572 utf32_t ch, 581 utf32_t ch,
573 const char* lang) 582 const char *lang)
574{ 583{
575 lbpCtx->lang = lang; 584 lbpCtx->lang = lang;
576 lbpCtx->lbpLang = get_lb_prop_lang(lang); 585 lbpCtx->lbpLang = get_lb_prop_lang(lang);
@@ -579,6 +588,7 @@ void lb_init_break_context(
579 lbpCtx->lbcCur = resolve_lb_class( 588 lbpCtx->lbcCur = resolve_lb_class(
580 get_char_lb_class_lang(ch, lbpCtx->lbpLang), 589 get_char_lb_class_lang(ch, lbpCtx->lbpLang),
581 lbpCtx->lang); 590 lbpCtx->lang);
591 lbpCtx->fLb21aHebrew = 0;
582 treat_first_char(lbpCtx); 592 treat_first_char(lbpCtx);
583} 593}
584 594
@@ -593,7 +603,7 @@ void lb_init_break_context(
593 * @post the line breaking context is updated 603 * @post the line breaking context is updated
594 */ 604 */
595int lb_process_next_char( 605int lb_process_next_char(
596 struct LineBreakContext* lbpCtx, 606 struct LineBreakContext *lbpCtx,
597 utf32_t ch ) 607 utf32_t ch )
598{ 608{
599 int brk; 609 int brk;
@@ -618,127 +628,6 @@ int lb_process_next_char(
618} 628}
619 629
620/** 630/**
621 * Gets the next Unicode character in a UTF-8 sequence. The index will
622 * be advanced to the next complete character, unless the end of string
623 * is reached in the middle of a UTF-8 sequence.
624 *
625 * @param[in] s input UTF-8 string
626 * @param[in] len length of the string in bytes
627 * @param[in,out] ip pointer to the index
628 * @return the Unicode character beginning at the index; or
629 * #EOS if end of input is encountered
630 */
631utf32_t lb_get_next_char_utf8(
632 const utf8_t *s,
633 size_t len,
634 size_t *ip)
635{
636 utf8_t ch;
637 utf32_t res;
638
639 assert(*ip <= len);
640 if (*ip == len)
641 return EOS;
642 ch = s[*ip];
643
644 if (ch < 0xC2 || ch > 0xF4)
645 { /* One-byte sequence, tail (should not occur), or invalid */
646 *ip += 1;
647 return ch;
648 }
649 else if (ch < 0xE0)
650 { /* Two-byte sequence */
651 if (*ip + 2 > len)
652 return EOS;
653 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
654 *ip += 2;
655 return res;
656 }
657 else if (ch < 0xF0)
658 { /* Three-byte sequence */
659 if (*ip + 3 > len)
660 return EOS;
661 res = ((ch & 0x0F) << 12) +
662 ((s[*ip + 1] & 0x3F) << 6) +
663 ((s[*ip + 2] & 0x3F));
664 *ip += 3;
665 return res;
666 }
667 else
668 { /* Four-byte sequence */
669 if (*ip + 4 > len)
670 return EOS;
671 res = ((ch & 0x07) << 18) +
672 ((s[*ip + 1] & 0x3F) << 12) +
673 ((s[*ip + 2] & 0x3F) << 6) +
674 ((s[*ip + 3] & 0x3F));
675 *ip += 4;
676 return res;
677 }
678}
679
680/**
681 * Gets the next Unicode character in a UTF-16 sequence. The index will
682 * be advanced to the next complete character, unless the end of string
683 * is reached in the middle of a UTF-16 surrogate pair.
684 *
685 * @param[in] s input UTF-16 string
686 * @param[in] len length of the string in words
687 * @param[in,out] ip pointer to the index
688 * @return the Unicode character beginning at the index; or
689 * #EOS if end of input is encountered
690 */
691utf32_t lb_get_next_char_utf16(
692 const utf16_t *s,
693 size_t len,
694 size_t *ip)
695{
696 utf16_t ch;
697
698 assert(*ip <= len);
699 if (*ip == len)
700 return EOS;
701 ch = s[(*ip)++];
702
703 if (ch < 0xD800 || ch > 0xDBFF)
704 { /* If the character is not a high surrogate */
705 return ch;
706 }
707 if (*ip == len)
708 { /* If the input ends here (an error) */
709 --(*ip);
710 return EOS;
711 }
712 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
713 { /* If the next character is not the low surrogate (an error) */
714 return ch;
715 }
716 /* Return the constructed character and advance the index again */
717 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
718}
719
720/**
721 * Gets the next Unicode character in a UTF-32 sequence. The index will
722 * be advanced to the next character.
723 *
724 * @param[in] s input UTF-32 string
725 * @param[in] len length of the string in dwords
726 * @param[in,out] ip pointer to the index
727 * @return the Unicode character beginning at the index; or
728 * #EOS if end of input is encountered
729 */
730utf32_t lb_get_next_char_utf32(
731 const utf32_t *s,
732 size_t len,
733 size_t *ip)
734{
735 assert(*ip <= len);
736 if (*ip == len)
737 return EOS;
738 return s[(*ip)++];
739}
740
741/**
742 * Sets the line breaking information for a generic input string. 631 * Sets the line breaking information for a generic input string.
743 * 632 *
744 * @param[in] s input string 633 * @param[in] s input string
@@ -809,7 +698,7 @@ void set_linebreaks_utf8(
809 char *brks) 698 char *brks)
810{ 699{
811 set_linebreaks(s, len, lang, brks, 700 set_linebreaks(s, len, lang, brks,
812 (get_next_char_t)lb_get_next_char_utf8); 701 (get_next_char_t)ub_get_next_char_utf8);
813} 702}
814 703
815/** 704/**
@@ -829,7 +718,7 @@ void set_linebreaks_utf16(
829 char *brks) 718 char *brks)
830{ 719{
831 set_linebreaks(s, len, lang, brks, 720 set_linebreaks(s, len, lang, brks,
832 (get_next_char_t)lb_get_next_char_utf16); 721 (get_next_char_t)ub_get_next_char_utf16);
833} 722}
834 723
835/** 724/**
@@ -849,7 +738,7 @@ void set_linebreaks_utf32(
849 char *brks) 738 char *brks)
850{ 739{
851 set_linebreaks(s, len, lang, brks, 740 set_linebreaks(s, len, lang, brks,
852 (get_next_char_t)lb_get_next_char_utf32); 741 (get_next_char_t)ub_get_next_char_utf32);
853} 742}
854 743
855/** 744/**
@@ -868,7 +757,7 @@ void set_linebreaks_utf32(
868int is_line_breakable( 757int is_line_breakable(
869 utf32_t char1, 758 utf32_t char1,
870 utf32_t char2, 759 utf32_t char2,
871 const char* lang) 760 const char *lang)
872{ 761{
873 utf32_t s[2]; 762 utf32_t s[2];
874 char brks[2]; 763 char brks[2];