From cff1a9a59f40b1e83ed1db8145108cae53504d4f Mon Sep 17 00:00:00 2001 From: Tom Hacohen Date: Tue, 21 Jan 2014 16:41:06 +0000 Subject: [PATCH] Synced libunibreak local copy with upstream. This fixes T805. --- src/static_libs/libunibreak/AUTHORS | 5 +- src/static_libs/libunibreak/ChangeLog | 118 +- src/static_libs/libunibreak/NEWS | 7 + src/static_libs/libunibreak/README | 79 +- src/static_libs/libunibreak/linebreak.c | 1216 ++++++++++--------- src/static_libs/libunibreak/linebreak.h | 38 +- src/static_libs/libunibreak/linebreakdata.c | 19 +- src/static_libs/libunibreak/linebreakdef.c | 102 +- src/static_libs/libunibreak/linebreakdef.h | 152 ++- src/static_libs/libunibreak/wordbreak.c | 628 +++++----- src/static_libs/libunibreak/wordbreak.h | 32 +- src/static_libs/libunibreak/wordbreakdef.h | 63 +- 12 files changed, 1359 insertions(+), 1100 deletions(-) diff --git a/src/static_libs/libunibreak/AUTHORS b/src/static_libs/libunibreak/AUTHORS index 22786d4201..1b4f4b41d8 100644 --- a/src/static_libs/libunibreak/AUTHORS +++ b/src/static_libs/libunibreak/AUTHORS @@ -1,4 +1,5 @@ -Wu Yongwei. Designed and implemented liblinebreak. +Wu Yongwei. Designed and implemented the original liblinebreak. +Current maintainer of libunibreak. Nikolay Pultsin. Put forward the original requirements on liblinebreak, performed tests, and made a lot of suggestions on the initial versions. @@ -6,3 +7,5 @@ performed tests, and made a lot of suggestions on the initial versions. Thomas Klausner. Autoconfiscated and libtoolized liblinebreak. Tom Hacohen. Added word boundaries support. + +Petr Filipsky. Added incremental processing for line-breaking. diff --git a/src/static_libs/libunibreak/ChangeLog b/src/static_libs/libunibreak/ChangeLog index 7d5e3b6391..feb830bd37 100644 --- a/src/static_libs/libunibreak/ChangeLog +++ b/src/static_libs/libunibreak/ChangeLog @@ -1,3 +1,116 @@ +2013-11-14 Wu Yongwei + + * src/linebreak.c: Add/update comments and doc comments. + (lb_init_breaking_class): Rename to treat_first_char. + (lb_classify_break_simple): Rename to get_lb_result_simple. + (lb_classify_break_lookup): Rename to get_lb_result_lookup. + (set_linebreaks): Remove an unused local variable. + +2013-11-14 Wu Yongwei + + * src/linebreakdata.c: Regenerate from LineBreak-6.3.0.txt. + +2013-11-13 Wu Yongwei + + Fix compilation problems under MSVC. + * src/linebreak.c (lb_init_breaking_class): Remove `inline'. + (lb_classify_break_simple): Ditto. + (lb_classify_break_lookup): Ditto. + (lb_classify_break_lookup): Move local variable declaration before + assertions. + +2013-11-10 Wu Yongwei + + * src/Makefile.am (libunibreak_la_LDFLAGS): Set the version-info to + `2:0:1'. + +2013-11-10 Wu Yongwei + + * src/linebreakdef.c: Adjust the order of code. + (lb_process_next_char): Make its return type int. + * src/linebreak.c (lb_process_next_char): Ditto. + +2013-11-10 Wu Yongwei + + * src/linebreak.c: Make minor changes in doc comments, formatting, + and names. + * src/linebreakdef.c: Ditto. + +2013-11-10 Wu Yongwei + + * AUTHORS: Add `Petr Filipsky'. + +2013-11-10 Petr Filipsky + + Expose low level line-breaking API for incremental processing. + * src/linebreak.h: Add prototype declarations for + lb_init_break_context and lb_process_next_char. + (struct LineBreakContext): New struct. + * src/linebreak.h (LINEBREAK_UNDEFINED): New macro constant. + (lb_init_breaking_class): New static function. + (lb_classify_break_simple): New static function. + (lb_classify_break_lookup): New static function. + (lb_init_break_context): New function. + (lb_process_next_char): New function. + (set_linebreaks): Implement with lb_init_break_context and + lb_process_next_char. + +2013-11-05 Petr Filipsky + + * src/wordbreakdef.h (enum WordBreakClass): Update according to + Table 3 of Unicode Standard Annex 29, Revision 23. + +2013-09-30 Wu Yongwei + + Update for the libunibreak 1.1 release. + * configure.ac (AC_INIT): Change the library version to `1.1'. + * Doxyfile (PROJECT_NUMBER): Change to `1.1'. + * Makefile.am (EXTRA_DIST): Add the `tools' directory. + * NEWS: Add information about libunibreak 1.1. + * src/Makefile.am (libunibreak_la_LDFLAGS): Set the version to `1:1'. + +2013-09-29 Wu Yongwei + + * src/Makefile.msvc: Modernize obsolete/deprecated MSVC options. + +2013-09-28 Wu Yongwei + + * src/wordbreak.c: Update copyright year and UAX information. + * src/wordbreak.h: Ditto. + * src/wordbreakdef.h: Ditto. + +2013-09-28 Wu Yongwei + + Fix the errors caused by libtool 2.4 (really annoying to the level + of WTF for making me add the foolish dependency on m4). + * Makefile.am (ACLOCAL_AMFLAGS): Add `-I m4'. + * bootstrap: Add a line to execute autoreconf. + * configure.ac (AC_CONFIG_MACRO_DIR): Set to `[m4]'. + * purge: Make it remove also the m4 directory. + +2013-09-28 Wu Yongwei + + * Makefile.am (EXTRA_DIST): Add `README.md'. + +2013-09-28 Wu Yongwei + + * README.md: New Markdown version of README. + * README: Remove. + +2013-05-13 Tom Hacohen + + Update files according to UAX #29-21, for Unicode 6.2.0. + * README: Update the reference to UAX #29-21. + * src/wordbreak.c (set_wordbreaks): Update for WBP_Regional. + * src/wordbreakdef.h (WBP_Regional): New enumerator for the new + property `RI' as defined in UAX #29-21. + * src/wordbreakdata.c: Regenerate from WordBreakProperty-6.2.0.txt. + +2013-05-06 Wu Yongwei + + * src/Makefile.am (install-exec-hook): Make sure `--disable-static' + can work (thanks to Eugene V. Lyubimkin). + 2012-10-06 Wu Yongwei Update files according to UAX #14-30, for Unicode 6.2.0. @@ -82,11 +195,12 @@ 2012-08-11 Wu Yongwei + Update for the libunibreak 1.0 release. * configure.ac (AC_INIT): Change the library name and version to `libunibreak' and `1.0'. (AC_PROG_LN_S): New macro. (AC_OUTPUT): Change to `libunibreak.pc'. - * Doxyfile: (PROJECT_NAME): Change to `libunibreak'. + * Doxyfile (PROJECT_NAME): Change to `libunibreak'. (PROJECT_NUMBER): Change to `1.0'. * LICENCE: Add copyright information about Tom Hacohen. * Makefile.am (lib_LTLIBRARIES): Change to `libunibreak.la'. @@ -96,7 +210,7 @@ a symlink to libunibreak.a. * Makefile.msvc: Change the library name to `libunibreak', and the output library to `unibreak.lib'. - * NEW: Add information about libunibreak 1.0. + * NEWS: Add information about libunibreak 1.0. * README: Change the library name, and add information about word break. diff --git a/src/static_libs/libunibreak/NEWS b/src/static_libs/libunibreak/NEWS index 581cab7cb8..3d3fcb809f 100644 --- a/src/static_libs/libunibreak/NEWS +++ b/src/static_libs/libunibreak/NEWS @@ -1,3 +1,10 @@ +New in libunibreak 1.1 + +- Update the code and data to conform to Unicode 6.2.0 +- Update build files to support libtool 2.4 +- Adjust code structure +- Make a few bug fixes + New in libunibreak 1.0 - Add word breaking support diff --git a/src/static_libs/libunibreak/README b/src/static_libs/libunibreak/README index 39b41570f3..52cd7388b5 100644 --- a/src/static_libs/libunibreak/README +++ b/src/static_libs/libunibreak/README @@ -1,31 +1,30 @@ - L I B U N I B R E A K - ===================== +LIBUNIBREAK +=========== Overview -------- This is the README file for libunibreak, an implementation of the line -breaking and word breaking algorithms as described in Unicode -Standard Annex 14 and Unicode Standard Annex 30, available at - - +breaking and word breaking algorithms as described in [Unicode Standard +Annex 14] [1] and [Unicode Standard Annex 29] [2]. Check the project's +[home page] [3] for up-to-date information. -Check this URL for up-to-date information: - + [1]: http://www.unicode.org/reports/tr14/tr14-30.html + [2]: http://www.unicode.org/reports/tr29/tr29-21.html + [3]: https://github.com/adah1972/libunibreak Licence ------- This library is released under an open-source licence, the zlib/libpng -licence. Please check the file LICENCE for details. +licence. Please check the file *LICENCE* for details. Apart from using the algorithm, part of the code is derived from the -data provided under - +[Unicode Public Data] [4], and the [Unicode Terms of Use] [5] may apply. -And the Unicode Terms of Use may apply: - + [4]: http://www.unicode.org/Public/ + [5]: http://www.unicode.org/copyright.html Installation @@ -33,56 +32,56 @@ Installation There are three ways to build the library: -1) On *NIX systems supported by the autoconfiscation tools, do the +1. On \*NIX systems supported by the autoconfiscation tools, do the normal - ./configure - make - sudo make install + ./configure + make + sudo make install to build and install both the dynamic and static libraries. In addition, one may + - type `make doc` to generate the doxygen documentation; or + - type `make linebreakdata` to regenerate *linebreakdata.c* from + *LineBreak.txt*. + - type `make wordbreakdata` to regenerate *wordbreakdata.c* from + *WordBreakProperty.txt*. - - type `make doc' to generate the doxygen documentation; or - - type `make linebreakdata' to regenerate linebreakdata.c from - LineBreak.txt. - - type `make wordbreakdata' to regenerate wordbreakdata.c from - WordBreakProperty.txt. +2. On systems where GCC and Binutils are supported, one can type -2) On systems where GCC and Binutils are supported, one can type - - cd src - cp -p Makefile.gcc Makefile - make + cd src + cp -p Makefile.gcc Makefile + make to build the static library. In addition, one may - - - type `make debug' or `make release' to explicitly generate the + - type `make debug` or `make release` to explicitly generate the debug or release build; - - type `make doc' to generate the doxygen documentation; or - - type `make linebreakdata' to regenerate linebreakdata.c from - LineBreak.txt. - - type `make wordbreakdata' to regenerate wordbreakdata.c from - WordBreakProperty.txt. + - type `make doc` to generate the doxygen documentation; or + - type `make linebreakdata` to regenerate *linebreakdata.c* from + *LineBreak.txt*. + - type `make wordbreakdata` to regenerate *wordbreakdata.c* from + *WordBreakProperty.txt*. -3) On Windows, apart from using method 1 (Cygwin/MSYS) and method 2 +3. On Windows, apart from using method 1 (Cygwin/MSYS) and method 2 (MinGW), MSVC can also be used. Type - cd src - nmake -f Makefile.msvc + cd src + nmake -f Makefile.msvc to build the static library. By default the debug release is built. To build the release version - nmake -f Makefile.msvc CFG="libunibreak - Win32 Release" + nmake -f Makefile.msvc CFG="libunibreak - Win32 Release" Documentation ------------- -Check the generated document doc/html/linebreak_8h.html and -doc/html/wordbreak_8h.html in the downloaded file for the public +Check the generated document *doc/html/linebreak\_8h.html* and +*doc/html/wordbreak\_8h.html* in the downloaded file for the public interfaces exposed to applications. + diff --git a/src/static_libs/libunibreak/linebreak.c b/src/static_libs/libunibreak/linebreak.c index c1ea405883..9716df4860 100644 --- a/src/static_libs/libunibreak/linebreak.c +++ b/src/static_libs/libunibreak/linebreak.c @@ -1,10 +1,11 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Line breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2008-2012 Wu Yongwei + * Copyright (C) 2008-2013 Wu Yongwei + * Copyright (C) 2013 Petr Filipsky * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -24,28 +25,29 @@ * distribution. * * The main reference is Unicode Standard Annex 14 (UAX #14): - * + * * * When this library was designed, this annex was at Revision 19, for * Unicode 5.0.0: - * + * * * This library has been updated according to Revision 30, for * Unicode 6.2.0: - * + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file linebreak.c + * @file linebreak.c * * Implementation of the line breaking algorithm as described in Unicode * Standard Annex 14. * - * @version 2.3, 2012/10/06 - * @author Wu Yongwei + * @version 2.5, 2013/11/14 + * @author Wu Yongwei + * @author Petr Filipsky */ #include @@ -54,6 +56,11 @@ #include "linebreak.h" #include "linebreakdef.h" +/** + * Special value used internally to indicate an undefined break result. + */ +#define LINEBREAK_UNDEFINED -1 + /** * Size of the second-level index to the line breaking properties. */ @@ -70,11 +77,11 @@ const int linebreak_version = LINEBREAK_VERSION; */ enum BreakAction { - DIR_BRK, /**< Direct break opportunity */ - IND_BRK, /**< Indirect break opportunity */ - CMI_BRK, /**< Indirect break opportunity for combining marks */ - CMP_BRK, /**< Prohibited break for combining marks */ - PRH_BRK /**< Prohibited break */ + DIR_BRK, /**< Direct break opportunity */ + IND_BRK, /**< Indirect break opportunity */ + CMI_BRK, /**< Indirect break opportunity for combining marks */ + CMP_BRK, /**< Prohibited break for combining marks */ + PRH_BRK /**< Prohibited break */ }; /** @@ -82,180 +89,180 @@ enum BreakAction * Unicode Standard Annex 14, Revision 30. */ static enum BreakAction baTable[LBP_RI][LBP_RI] = { - { /* OP */ - PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, - CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, - PRH_BRK }, - { /* CL */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* CP */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* QU */ - PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK }, - { /* GL */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK }, - { /* NS */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* EX */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* SY */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* IS */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* PR */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK }, - { /* PO */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* NU */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* AL */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* HL */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* ID */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* IN */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* HY */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* BA */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* BB */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK }, - { /* B2 */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* ZW */ - DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* CM */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK }, - { /* WJ */ - IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, - IND_BRK }, - { /* H2 */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, - DIR_BRK }, - { /* H3 */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, - DIR_BRK }, - { /* JL */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, - DIR_BRK }, - { /* JV */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, - DIR_BRK }, - { /* JT */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, - DIR_BRK }, - { /* RI */ - DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, - PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, - CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, - IND_BRK }, + { /* OP */ + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, + CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, + PRH_BRK }, + { /* CL */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* CP */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* QU */ + PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK }, + { /* GL */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK }, + { /* NS */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* EX */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* SY */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* IS */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* PR */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK }, + { /* PO */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* NU */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* AL */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* HL */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* ID */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* IN */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* HY */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* BA */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* BB */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK }, + { /* B2 */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* ZW */ + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* CM */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK }, + { /* WJ */ + IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, + IND_BRK }, + { /* H2 */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, + DIR_BRK }, + { /* H3 */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, + DIR_BRK }, + { /* JL */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, + DIR_BRK }, + { /* JV */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, + DIR_BRK }, + { /* JT */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, + DIR_BRK }, + { /* RI */ + DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, + PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, + CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, + IND_BRK }, }; /** @@ -263,8 +270,8 @@ static enum BreakAction baTable[LBP_RI][LBP_RI] = { */ struct LineBreakPropertiesIndex { - utf32_t end; /**< End coding point */ - struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */ + utf32_t end; /**< End coding point */ + struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */ }; /** @@ -272,7 +279,7 @@ struct LineBreakPropertiesIndex */ static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] = { - { 0xFFFFFFFF, lb_prop_default } + { 0xFFFFFFFF, lb_prop_default } }; /** @@ -283,84 +290,84 @@ static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] = */ void init_linebreak(void) { - size_t i; - size_t iPropDefault; - size_t len; - size_t step; + size_t i; + size_t iPropDefault; + size_t len; + size_t step; - len = 0; - while (lb_prop_default[len].prop != LBP_Undefined) - ++len; - step = len / LINEBREAK_INDEX_SIZE; - iPropDefault = 0; - for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i) - { - lb_prop_index[i].lbp = lb_prop_default + iPropDefault; - iPropDefault += step; - lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1; - } - lb_prop_index[--i].end = 0xFFFFFFFF; + len = 0; + while (lb_prop_default[len].prop != LBP_Undefined) + ++len; + step = len / LINEBREAK_INDEX_SIZE; + iPropDefault = 0; + for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i) + { + lb_prop_index[i].lbp = lb_prop_default + iPropDefault; + iPropDefault += step; + lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1; + } + lb_prop_index[--i].end = 0xFFFFFFFF; } /** * Gets the language-specific line breaking properties. * - * @param lang language of the text - * @return pointer to the language-specific line breaking - * properties array if found; \c NULL otherwise + * @param lang language of the text + * @return pointer to the language-specific line breaking + * properties array if found; \c NULL otherwise */ static struct LineBreakProperties *get_lb_prop_lang(const char *lang) { - struct LineBreakPropertiesLang *lbplIter; - if (lang != NULL) - { - for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter) - { - if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0) - { - return lbplIter->lbp; - } - } - } - return NULL; + struct LineBreakPropertiesLang *lbplIter; + if (lang != NULL) + { + for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter) + { + if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0) + { + return lbplIter->lbp; + } + } + } + return NULL; } /** * Gets the line breaking class of a character from a line breaking * properties array. * - * @param ch character to check - * @param lbp pointer to the line breaking properties array - * @return the line breaking class if found; \c LBP_XX otherwise + * @param ch character to check + * @param lbp pointer to the line breaking properties array + * @return the line breaking class if found; \c LBP_XX otherwise */ static enum LineBreakClass get_char_lb_class( - utf32_t ch, - struct LineBreakProperties *lbp) + utf32_t ch, + struct LineBreakProperties *lbp) { - while (lbp->prop != LBP_Undefined && ch >= lbp->start) - { - if (ch <= lbp->end) - return lbp->prop; - ++lbp; - } - return LBP_XX; + while (lbp->prop != LBP_Undefined && ch >= lbp->start) + { + if (ch <= lbp->end) + return lbp->prop; + ++lbp; + } + return LBP_XX; } /** * Gets the line breaking class of a character from the default line * breaking properties array. * - * @param ch character to check - * @return the line breaking class if found; \c LBP_XX otherwise + * @param ch character to check + * @return the line breaking class if found; \c LBP_XX otherwise */ static enum LineBreakClass get_char_lb_class_default( - utf32_t ch) + utf32_t ch) { - size_t i = 0; - while (ch > lb_prop_index[i].end) - ++i; - assert(i < LINEBREAK_INDEX_SIZE); - return get_char_lb_class(ch, lb_prop_index[i].lbp); + size_t i = 0; + while (ch > lb_prop_index[i].end) + ++i; + assert(i < LINEBREAK_INDEX_SIZE); + return get_char_lb_class(ch, lb_prop_index[i].lbp); } /** @@ -369,30 +376,30 @@ static enum LineBreakClass get_char_lb_class_default( * and then the default data if there is no language-specific property * available for the character. * - * @param ch character to check - * @param lbpLang pointer to the language-specific line breaking - * properties array - * @return the line breaking class if found; \c LBP_XX - * otherwise + * @param ch character to check + * @param lbpLang pointer to the language-specific line breaking + * properties array + * @return the line breaking class if found; \c LBP_XX + * otherwise */ static enum LineBreakClass get_char_lb_class_lang( - utf32_t ch, - struct LineBreakProperties *lbpLang) + utf32_t ch, + struct LineBreakProperties *lbpLang) { - enum LineBreakClass lbcResult; + enum LineBreakClass lbcResult; - /* Find the language-specific line breaking class for a character */ - if (lbpLang) - { - lbcResult = get_char_lb_class(ch, lbpLang); - if (lbcResult != LBP_XX) - return lbcResult; - } + /* Find the language-specific line breaking class for a character */ + if (lbpLang) + { + lbcResult = get_char_lb_class(ch, lbpLang); + if (lbcResult != LBP_XX) + return lbcResult; + } - /* Find the generic language-specific line breaking class, if no - * language context is provided, or language-specific data are not - * available for the specific character in the specified language */ - return get_char_lb_class_default(ch); + /* Find the generic language-specific line breaking class, if no + * language context is provided, or language-specific data are not + * available for the specific character in the specified language */ + return get_char_lb_class_default(ch); } /** @@ -400,40 +407,214 @@ static enum LineBreakClass get_char_lb_class_lang( * characters. They are treated in a simplistic way in this * implementation. * - * @param lbc line breaking class to resolve - * @param lang language of the text - * @return the resolved line breaking class + * @param lbc line breaking class to resolve + * @param lang language of the text + * @return the resolved line breaking class */ static enum LineBreakClass resolve_lb_class( - enum LineBreakClass lbc, - const char *lang) + enum LineBreakClass lbc, + const char *lang) { - switch (lbc) - { - case LBP_AI: - if (lang != NULL && - (strncmp(lang, "zh", 2) == 0 || /* Chinese */ - strncmp(lang, "ja", 2) == 0 || /* Japanese */ - strncmp(lang, "ko", 2) == 0)) /* Korean */ - { - return LBP_ID; - } - else - { - return LBP_AL; - } - case LBP_CJ: - /* Simplified for `normal' line breaking. See - * - * for details. */ - return LBP_ID; - case LBP_SA: - case LBP_SG: - case LBP_XX: - return LBP_AL; - default: - return lbc; - } + switch (lbc) + { + case LBP_AI: + if (lang != NULL && + (strncmp(lang, "zh", 2) == 0 || /* Chinese */ + strncmp(lang, "ja", 2) == 0 || /* Japanese */ + strncmp(lang, "ko", 2) == 0)) /* Korean */ + { + return LBP_ID; + } + else + { + return LBP_AL; + } + case LBP_CJ: + /* Simplified for `normal' line breaking. See + * + * for details. */ + return LBP_ID; + case LBP_SA: + case LBP_SG: + case LBP_XX: + return LBP_AL; + default: + return lbc; + } +} + +/** + * Treats specially for the first character in a line. + * + * @param[in,out] lbpCtx pointer to the line breaking context + * @pre \a lbpCtx->lbcCur has a valid line break class + * @post \a lbpCtx->lbcCur has the updated line break class + */ +static void treat_first_char( + struct LineBreakContext* lbpCtx) +{ + switch (lbpCtx->lbcCur) + { + case LBP_LF: + case LBP_NL: + lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */ + break; + case LBP_CB: + lbpCtx->lbcCur = LBP_BA; /* Rule LB20 */ + break; + case LBP_SP: + lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */ + break; + default: + break; + } +} + +/** + * Tries telling the line break opportunity by simple rules. + * + * @param[in,out] lbpCtx pointer to the line breaking context + * @pre \a lbpCtx->lbcCur has the current line break + * class; and \a lbpCtx->lbcNew has the line + * break class for the next character + * @post \a lbpCtx->lbcCur has the updated line break + * class + * @return break result, one of #LINEBREAK_MUSTBREAK, + * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK + * if identified; or #LINEBREAK_UNDEFINED if + * table lookup is needed + */ +static int get_lb_result_simple( + struct LineBreakContext* lbpCtx) +{ + if (lbpCtx->lbcCur == LBP_BK + || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF)) + { + return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */ + } + + switch (lbpCtx->lbcNew) + { + case LBP_SP: + return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */ + case LBP_BK: + case LBP_LF: + case LBP_NL: + lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */ + return LINEBREAK_NOBREAK; /* Rule LB6 */ + case LBP_CR: + lbpCtx->lbcCur = LBP_CR; + return LINEBREAK_NOBREAK; /* Rule LB6 */ + case LBP_CB: + lbpCtx->lbcCur = LBP_BA; + return LINEBREAK_ALLOWBREAK; /* Rule LB20 */ + default: + return LINEBREAK_UNDEFINED; /* Table lookup is needed */ + } +} + +/** + * Tells the line break opportunity by table lookup. + * + * @param[in,out] lbpCtx pointer to the line breaking context + * @pre \a lbpCtx->lbcCur has the current line break + * class; \a lbpCtx->lbcLast has the line break + * class for the last character; and \a + * lbcCur->lbcNew has the line break class for + * the next character + * @post \a lbpCtx->lbcCur has the updated line break + * class + * @return break result, one of #LINEBREAK_MUSTBREAK, + * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK + */ +static int get_lb_result_lookup( + struct LineBreakContext* lbpCtx) +{ + /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not + * yet implemented below. */ + int brk = LINEBREAK_UNDEFINED; + assert(lbpCtx->lbcCur <= LBP_JT); + assert(lbpCtx->lbcNew <= LBP_JT); + switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1]) + { + case DIR_BRK: + brk = LINEBREAK_ALLOWBREAK; + break; + case CMI_BRK: + case IND_BRK: + brk = (lbpCtx->lbcLast == LBP_SP) + ? LINEBREAK_ALLOWBREAK + : LINEBREAK_NOBREAK; + break; + case CMP_BRK: + brk = LINEBREAK_NOBREAK; + if (lbpCtx->lbcLast != LBP_SP) + return brk; /* Do not update lbcCur */ + break; + case PRH_BRK: + brk = LINEBREAK_NOBREAK; + break; + } + lbpCtx->lbcCur = lbpCtx->lbcNew; + return brk; +} + +/** + * Initializes line breaking context for a given language. + * + * @param[in,out] lbpCtx pointer to the line breaking context + * @param[in] ch the first character to process + * @param[in] lang language of the input + * @post the line breaking context is initialized + */ +void lb_init_break_context( + struct LineBreakContext* lbpCtx, + utf32_t ch, + const char* lang) +{ + lbpCtx->lang = lang; + lbpCtx->lbpLang = get_lb_prop_lang(lang); + lbpCtx->lbcLast = LBP_Undefined; + lbpCtx->lbcNew = LBP_Undefined; + lbpCtx->lbcCur = resolve_lb_class( + get_char_lb_class_lang(ch, lbpCtx->lbpLang), + lbpCtx->lang); + treat_first_char(lbpCtx); +} + +/** + * Updates LineBreakingContext for the next code point and returns + * the detected break. + * + * @param[in,out] lbpCtx pointer to the line breaking context + * @param[in] ch Unicode code point + * @return break result, one of #LINEBREAK_MUSTBREAK, + * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK + * @post the line breaking context is updated + */ +int lb_process_next_char( + struct LineBreakContext* lbpCtx, + utf32_t ch ) +{ + int brk; + + lbpCtx->lbcLast = lbpCtx->lbcNew; + lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang); + brk = get_lb_result_simple(lbpCtx); + switch (brk) + { + case LINEBREAK_MUSTBREAK: + lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang); + treat_first_char(lbpCtx); + break; + case LINEBREAK_UNDEFINED: + lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang); + brk = get_lb_result_lookup(lbpCtx); + break; + default: + break; + } + return brk; } /** @@ -441,59 +622,59 @@ static enum LineBreakClass resolve_lb_class( * be advanced to the next complete character, unless the end of string * is reached in the middle of a UTF-8 sequence. * - * @param[in] s input UTF-8 string - * @param[in] len length of the string in bytes - * @param[in,out] ip pointer to the index - * @return the Unicode character beginning at the index; or - * #EOS if end of input is encountered + * @param[in] s input UTF-8 string + * @param[in] len length of the string in bytes + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered */ utf32_t lb_get_next_char_utf8( - const utf8_t *s, - size_t len, - size_t *ip) + const utf8_t *s, + size_t len, + size_t *ip) { - utf8_t ch; - utf32_t res; + utf8_t ch; + utf32_t res; - assert(*ip <= len); - if (*ip == len) - return EOS; - ch = s[*ip]; + assert(*ip <= len); + if (*ip == len) + return EOS; + ch = s[*ip]; - if (ch < 0xC2 || ch > 0xF4) - { /* One-byte sequence, tail (should not occur), or invalid */ - *ip += 1; - return ch; - } - else if (ch < 0xE0) - { /* Two-byte sequence */ - if (*ip + 2 > len) - return EOS; - res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); - *ip += 2; - return res; - } - else if (ch < 0xF0) - { /* Three-byte sequence */ - if (*ip + 3 > len) - return EOS; - res = ((ch & 0x0F) << 12) + - ((s[*ip + 1] & 0x3F) << 6) + - ((s[*ip + 2] & 0x3F)); - *ip += 3; - return res; - } - else - { /* Four-byte sequence */ - if (*ip + 4 > len) - return EOS; - res = ((ch & 0x07) << 18) + - ((s[*ip + 1] & 0x3F) << 12) + - ((s[*ip + 2] & 0x3F) << 6) + - ((s[*ip + 3] & 0x3F)); - *ip += 4; - return res; - } + if (ch < 0xC2 || ch > 0xF4) + { /* One-byte sequence, tail (should not occur), or invalid */ + *ip += 1; + return ch; + } + else if (ch < 0xE0) + { /* Two-byte sequence */ + if (*ip + 2 > len) + return EOS; + res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); + *ip += 2; + return res; + } + else if (ch < 0xF0) + { /* Three-byte sequence */ + if (*ip + 3 > len) + return EOS; + res = ((ch & 0x0F) << 12) + + ((s[*ip + 1] & 0x3F) << 6) + + ((s[*ip + 2] & 0x3F)); + *ip += 3; + return res; + } + else + { /* Four-byte sequence */ + if (*ip + 4 > len) + return EOS; + res = ((ch & 0x07) << 18) + + ((s[*ip + 1] & 0x3F) << 12) + + ((s[*ip + 2] & 0x3F) << 6) + + ((s[*ip + 3] & 0x3F)); + *ip += 4; + return res; + } } /** @@ -501,263 +682,174 @@ utf32_t lb_get_next_char_utf8( * be advanced to the next complete character, unless the end of string * is reached in the middle of a UTF-16 surrogate pair. * - * @param[in] s input UTF-16 string - * @param[in] len length of the string in words - * @param[in,out] ip pointer to the index - * @return the Unicode character beginning at the index; or - * #EOS if end of input is encountered + * @param[in] s input UTF-16 string + * @param[in] len length of the string in words + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered */ utf32_t lb_get_next_char_utf16( - const utf16_t *s, - size_t len, - size_t *ip) + const utf16_t *s, + size_t len, + size_t *ip) { - utf16_t ch; + utf16_t ch; - assert(*ip <= len); - if (*ip == len) - return EOS; - ch = s[(*ip)++]; + assert(*ip <= len); + if (*ip == len) + return EOS; + ch = s[(*ip)++]; - if (ch < 0xD800 || ch > 0xDBFF) - { /* If the character is not a high surrogate */ - return ch; - } - if (*ip == len) - { /* If the input ends here (an error) */ - --(*ip); - return EOS; - } - if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) - { /* If the next character is not the low surrogate (an error) */ - return ch; - } - /* Return the constructed character and advance the index again */ - return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; + if (ch < 0xD800 || ch > 0xDBFF) + { /* If the character is not a high surrogate */ + return ch; + } + if (*ip == len) + { /* If the input ends here (an error) */ + --(*ip); + return EOS; + } + if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) + { /* If the next character is not the low surrogate (an error) */ + return ch; + } + /* Return the constructed character and advance the index again */ + return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; } /** * Gets the next Unicode character in a UTF-32 sequence. The index will * be advanced to the next character. * - * @param[in] s input UTF-32 string - * @param[in] len length of the string in dwords - * @param[in,out] ip pointer to the index - * @return the Unicode character beginning at the index; or - * #EOS if end of input is encountered + * @param[in] s input UTF-32 string + * @param[in] len length of the string in dwords + * @param[in,out] ip pointer to the index + * @return the Unicode character beginning at the index; or + * #EOS if end of input is encountered */ utf32_t lb_get_next_char_utf32( - const utf32_t *s, - size_t len, - size_t *ip) + const utf32_t *s, + size_t len, + size_t *ip) { - assert(*ip <= len); - if (*ip == len) - return EOS; - return s[(*ip)++]; + assert(*ip <= len); + if (*ip == len) + return EOS; + return s[(*ip)++]; } /** * Sets the line breaking information for a generic input string. * - * @param[in] s input string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, - * containing #LINEBREAK_MUSTBREAK, - * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK, - * or #LINEBREAK_INSIDEACHAR - * @param[in] get_next_char function to get the next UTF-32 character + * @param[in] s input string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, + * containing #LINEBREAK_MUSTBREAK, + * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK, + * or #LINEBREAK_INSIDEACHAR + * @param[in] get_next_char function to get the next UTF-32 character */ void set_linebreaks( - const void *s, - size_t len, - const char *lang, - char *brks, - get_next_char_t get_next_char) + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char) { - utf32_t ch; - enum LineBreakClass lbcCur; - enum LineBreakClass lbcNew; - enum LineBreakClass lbcLast; - struct LineBreakProperties *lbpLang; - size_t posCur = 0; - size_t posLast = 0; + utf32_t ch; + struct LineBreakContext lbCtx; + size_t posCur = 0; + size_t posLast = 0; - --posLast; /* To be ++'d later */ - ch = get_next_char(s, len, &posCur); - if (ch == EOS) - return; - lbpLang = get_lb_prop_lang(lang); - lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang); - lbcNew = LBP_Undefined; + --posLast; /* To be ++'d later */ + ch = get_next_char(s, len, &posCur); + if (ch == EOS) + return; + lb_init_break_context(&lbCtx, ch, lang); -nextline: + /* Process a line till an explicit break or end of string */ + for (;;) + { + for (++posLast; posLast < posCur - 1; ++posLast) + { + brks[posLast] = LINEBREAK_INSIDEACHAR; + } + assert(posLast == posCur - 1); + ch = get_next_char(s, len, &posCur); + if (ch == EOS) + break; + brks[posLast] = lb_process_next_char(&lbCtx, ch); + } - /* Special treatment for the first character */ - switch (lbcCur) - { - case LBP_LF: - case LBP_NL: - lbcCur = LBP_BK; - break; - case LBP_CB: - lbcCur = LBP_BA; - break; - case LBP_SP: - lbcCur = LBP_WJ; - break; - default: - break; - } - - /* Process a line till an explicit break or end of string */ - for (;;) - { - for (++posLast; posLast < posCur - 1; ++posLast) - { - brks[posLast] = LINEBREAK_INSIDEACHAR; - } - assert(posLast == posCur - 1); - lbcLast = lbcNew; - ch = get_next_char(s, len, &posCur); - if (ch == EOS) - break; - lbcNew = get_char_lb_class_lang(ch, lbpLang); - if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF)) - { - brks[posLast] = LINEBREAK_MUSTBREAK; - lbcCur = resolve_lb_class(lbcNew, lang); - goto nextline; - } - - switch (lbcNew) - { - case LBP_SP: - brks[posLast] = LINEBREAK_NOBREAK; - continue; - case LBP_BK: - case LBP_LF: - case LBP_NL: - brks[posLast] = LINEBREAK_NOBREAK; - lbcCur = LBP_BK; - continue; - case LBP_CR: - brks[posLast] = LINEBREAK_NOBREAK; - lbcCur = LBP_CR; - continue; - case LBP_CB: - brks[posLast] = LINEBREAK_ALLOWBREAK; - lbcCur = LBP_BA; - continue; - default: - break; - } - - lbcNew = resolve_lb_class(lbcNew, lang); - - /* TODO: LB21a, as introduced by Revision 28 of UAX#14, is not - * yet implemented below. */ - - assert(lbcCur <= LBP_JT); - assert(lbcNew <= LBP_JT); - switch (baTable[lbcCur - 1][lbcNew - 1]) - { - case DIR_BRK: - brks[posLast] = LINEBREAK_ALLOWBREAK; - break; - case CMI_BRK: - case IND_BRK: - if (lbcLast == LBP_SP) - { - brks[posLast] = LINEBREAK_ALLOWBREAK; - } - else - { - brks[posLast] = LINEBREAK_NOBREAK; - } - break; - case CMP_BRK: - brks[posLast] = LINEBREAK_NOBREAK; - if (lbcLast != LBP_SP) - continue; - break; - case PRH_BRK: - brks[posLast] = LINEBREAK_NOBREAK; - break; - } - - lbcCur = lbcNew; - } - - assert(posLast == posCur - 1 && posCur <= len); - /* Break after the last character */ - brks[posLast] = LINEBREAK_MUSTBREAK; - /* When the input contains incomplete sequences */ - while (posCur < len) - { - brks[posCur++] = LINEBREAK_INSIDEACHAR; - } + assert(posLast == posCur - 1 && posCur <= len); + /* Break after the last character */ + brks[posLast] = LINEBREAK_MUSTBREAK; + /* When the input contains incomplete sequences */ + while (posCur < len) + { + brks[posCur++] = LINEBREAK_INSIDEACHAR; + } } /** * Sets the line breaking information for a UTF-8 input string. * - * @param[in] s input UTF-8 string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, - * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + * @param[in] s input UTF-8 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR */ void set_linebreaks_utf8( - const utf8_t *s, - size_t len, - const char *lang, - char *brks) + const utf8_t *s, + size_t len, + const char *lang, + char *brks) { - set_linebreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf8); + set_linebreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf8); } /** * Sets the line breaking information for a UTF-16 input string. * - * @param[in] s input UTF-16 string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, - * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + * @param[in] s input UTF-16 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR */ void set_linebreaks_utf16( - const utf16_t *s, - size_t len, - const char *lang, - char *brks) + const utf16_t *s, + size_t len, + const char *lang, + char *brks) { - set_linebreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf16); + set_linebreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf16); } /** * Sets the line breaking information for a UTF-32 input string. * - * @param[in] s input UTF-32 string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, - * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + * @param[in] s input UTF-32 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR */ void set_linebreaks_utf32( - const utf32_t *s, - size_t len, - const char *lang, - char *brks) + const utf32_t *s, + size_t len, + const char *lang, + char *brks) { - set_linebreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf32); + set_linebreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf32); } /** @@ -767,21 +859,21 @@ void set_linebreaks_utf32( * complicated cases involving combining marks, spaces, etc. cannot be * correctly processed. * - * @param char1 the first Unicode character - * @param char2 the second Unicode character - * @param lang language of the input - * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, - * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR + * @param char1 the first Unicode character + * @param char2 the second Unicode character + * @param lang language of the input + * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK, + * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR */ int is_line_breakable( - utf32_t char1, - utf32_t char2, - const char* lang) + utf32_t char1, + utf32_t char2, + const char* lang) { - utf32_t s[2]; - char brks[2]; - s[0] = char1; - s[1] = char2; - set_linebreaks_utf32(s, 2, lang, brks); - return brks[0]; + utf32_t s[2]; + char brks[2]; + s[0] = char1; + s[1] = char2; + set_linebreaks_utf32(s, 2, lang, brks); + return brks[0]; } diff --git a/src/static_libs/libunibreak/linebreak.h b/src/static_libs/libunibreak/linebreak.h index 288ef1b352..94fbca0f9f 100644 --- a/src/static_libs/libunibreak/linebreak.h +++ b/src/static_libs/libunibreak/linebreak.h @@ -1,4 +1,4 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Line breaking in a Unicode sequence. Designed to be used in a @@ -24,27 +24,27 @@ * distribution. * * The main reference is Unicode Standard Annex 14 (UAX #14): - * + * * * When this library was designed, this annex was at Revision 19, for * Unicode 5.0.0: - * + * * * This library has been updated according to Revision 30, for * Unicode 6.2.0: - * + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file linebreak.h + * @file linebreak.h * * Header file for the line breaking algorithm. * - * @version 2.2, 2012/10/06 - * @author Wu Yongwei + * @version 2.2, 2012/10/06 + * @author Wu Yongwei */ #ifndef LINEBREAK_H @@ -56,28 +56,28 @@ extern "C" { #endif -#define LINEBREAK_VERSION 0x0202 /**< Version of the library linebreak */ +#define LINEBREAK_VERSION 0x0202 /**< Version of the library linebreak */ extern const int linebreak_version; #ifndef LINEBREAK_UTF_TYPES_DEFINED #define LINEBREAK_UTF_TYPES_DEFINED -typedef unsigned char utf8_t; /**< Type for UTF-8 data points */ -typedef unsigned short utf16_t; /**< Type for UTF-16 data points */ -typedef unsigned int utf32_t; /**< Type for UTF-32 data points */ +typedef unsigned char utf8_t; /**< Type for UTF-8 data points */ +typedef unsigned short utf16_t; /**< Type for UTF-16 data points */ +typedef unsigned int utf32_t; /**< Type for UTF-32 data points */ #endif -#define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */ -#define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */ -#define LINEBREAK_NOBREAK 2 /**< No break is possible */ -#define LINEBREAK_INSIDEACHAR 3 /**< A UTF-8/16 sequence is unfinished */ +#define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */ +#define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */ +#define LINEBREAK_NOBREAK 2 /**< No break is possible */ +#define LINEBREAK_INSIDEACHAR 3 /**< A UTF-8/16 sequence is unfinished */ void init_linebreak(void); void set_linebreaks_utf8( - const utf8_t *s, size_t len, const char* lang, char *brks); + const utf8_t *s, size_t len, const char* lang, char *brks); void set_linebreaks_utf16( - const utf16_t *s, size_t len, const char* lang, char *brks); + const utf16_t *s, size_t len, const char* lang, char *brks); void set_linebreaks_utf32( - const utf32_t *s, size_t len, const char* lang, char *brks); + const utf32_t *s, size_t len, const char* lang, char *brks); int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang); #ifdef __cplusplus diff --git a/src/static_libs/libunibreak/linebreakdata.c b/src/static_libs/libunibreak/linebreakdata.c index cced7d40bb..3843e3bfe7 100644 --- a/src/static_libs/libunibreak/linebreakdata.c +++ b/src/static_libs/libunibreak/linebreakdata.c @@ -1,6 +1,6 @@ /* The content of this file is generated from: -# LineBreak-6.2.0.txt -# Date: 2012-08-08, 19:26:00 GMT [KW] +# LineBreak-6.3.0.txt +# Date: 2013-02-06, 19:45:00 GMT [KW, LI] */ #include "linebreak.h" @@ -114,7 +114,9 @@ struct LineBreakProperties lb_prop_default[] = { { 0x060C, 0x060D, LBP_IS }, { 0x060E, 0x060F, LBP_AL }, { 0x0610, 0x061A, LBP_CM }, - { 0x061B, 0x061F, LBP_EX }, + { 0x061B, 0x061B, LBP_EX }, + { 0x061C, 0x061C, LBP_CM }, + { 0x061E, 0x061F, LBP_EX }, { 0x0620, 0x064A, LBP_AL }, { 0x064B, 0x065F, LBP_CM }, { 0x0660, 0x0669, LBP_NU }, @@ -456,7 +458,7 @@ struct LineBreakProperties lb_prop_default[] = { { 0x205D, 0x205F, LBP_BA }, { 0x2060, 0x2060, LBP_WJ }, { 0x2061, 0x2064, LBP_AL }, - { 0x206A, 0x206F, LBP_CM }, + { 0x2066, 0x206F, LBP_CM }, { 0x2070, 0x2071, LBP_AL }, { 0x2074, 0x2074, LBP_AI }, { 0x2075, 0x207C, LBP_AL }, @@ -473,7 +475,7 @@ struct LineBreakProperties lb_prop_default[] = { { 0x20A7, 0x20A7, LBP_PO }, { 0x20A8, 0x20B5, LBP_PR }, { 0x20B6, 0x20B6, LBP_PO }, - { 0x20B7, 0x20BA, LBP_PR }, + { 0x20B7, 0x20CF, LBP_PR }, { 0x20D0, 0x20F0, LBP_CM }, { 0x2100, 0x2102, LBP_AL }, { 0x2103, 0x2103, LBP_PO }, @@ -774,7 +776,8 @@ struct LineBreakProperties lb_prop_default[] = { { 0x2E33, 0x2E34, LBP_BA }, { 0x2E35, 0x2E39, LBP_AL }, { 0x2E3A, 0x2E3B, LBP_B2 }, - { 0x2E80, 0x3000, LBP_ID }, + { 0x2E80, 0x2FFB, LBP_ID }, + { 0x3000, 0x3000, LBP_BA }, { 0x3001, 0x3002, LBP_CL }, { 0x3003, 0x3004, LBP_ID }, { 0x3005, 0x3005, LBP_NS }, @@ -803,7 +806,9 @@ struct LineBreakProperties lb_prop_default[] = { { 0x301E, 0x301F, LBP_CL }, { 0x3020, 0x3029, LBP_ID }, { 0x302A, 0x302F, LBP_CM }, - { 0x3030, 0x303A, LBP_ID }, + { 0x3030, 0x3034, LBP_ID }, + { 0x3035, 0x3035, LBP_CM }, + { 0x3036, 0x303A, LBP_ID }, { 0x303B, 0x303C, LBP_NS }, { 0x303D, 0x303F, LBP_ID }, { 0x3041, 0x3041, LBP_CJ }, diff --git a/src/static_libs/libunibreak/linebreakdef.c b/src/static_libs/libunibreak/linebreakdef.c index fb6744da33..3455afd687 100644 --- a/src/static_libs/libunibreak/linebreakdef.c +++ b/src/static_libs/libunibreak/linebreakdef.c @@ -1,4 +1,4 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Line breaking in a Unicode sequence. Designed to be used in a @@ -24,27 +24,27 @@ * distribution. * * The main reference is Unicode Standard Annex 14 (UAX #14): - * + * * * When this library was designed, this annex was at Revision 19, for * Unicode 5.0.0: - * + * * * This library has been updated according to Revision 30, for * Unicode 6.2.0: - * + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file linebreakdef.c + * @file linebreakdef.c * * Definition of language-specific data. * - * @version 2.2, 2012/10/06 - * @author Wu Yongwei + * @version 2.2, 2012/10/06 + * @author Wu Yongwei */ #include "linebreak.h" @@ -54,72 +54,72 @@ * English-specifc data over the default Unicode rules. */ static struct LineBreakProperties lb_prop_English[] = { - { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ - { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ - { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ - { 0, 0, LBP_Undefined } + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0, 0, LBP_Undefined } }; /** * German-specifc data over the default Unicode rules. */ static struct LineBreakProperties lb_prop_German[] = { - { 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */ - { 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */ - { 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */ - { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */ - { 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */ - { 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */ - { 0, 0, LBP_Undefined } + { 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */ + { 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */ + { 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */ + { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */ + { 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */ + { 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */ + { 0, 0, LBP_Undefined } }; /** * Spanish-specifc data over the default Unicode rules. */ static struct LineBreakProperties lb_prop_Spanish[] = { - { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ - { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ - { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ - { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ - { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ - { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */ - { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */ - { 0, 0, LBP_Undefined } + { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ + { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */ + { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */ + { 0, 0, LBP_Undefined } }; /** * French-specifc data over the default Unicode rules. */ static struct LineBreakProperties lb_prop_French[] = { - { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ - { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ - { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ - { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ - { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ - { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */ - { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */ - { 0, 0, LBP_Undefined } + { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ + { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */ + { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */ + { 0, 0, LBP_Undefined } }; /** * Russian-specifc data over the default Unicode rules. */ static struct LineBreakProperties lb_prop_Russian[] = { - { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ - { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ - { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */ - { 0, 0, LBP_Undefined } + { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */ + { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */ + { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */ + { 0, 0, LBP_Undefined } }; /** * Chinese-specifc data over the default Unicode rules. */ static struct LineBreakProperties lb_prop_Chinese[] = { - { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ - { 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */ - { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ - { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ - { 0, 0, LBP_Undefined } + { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */ + { 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */ + { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */ + { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */ + { 0, 0, LBP_Undefined } }; /** @@ -129,11 +129,11 @@ static struct LineBreakProperties lb_prop_Chinese[] = { * you may want to redefine \e lb_prop_lang_map in your C source file. */ struct LineBreakPropertiesLang lb_prop_lang_map[] = { - { "en", 2, lb_prop_English }, - { "de", 2, lb_prop_German }, - { "es", 2, lb_prop_Spanish }, - { "fr", 2, lb_prop_French }, - { "ru", 2, lb_prop_Russian }, - { "zh", 2, lb_prop_Chinese }, - { NULL, 0, NULL } + { "en", 2, lb_prop_English }, + { "de", 2, lb_prop_German }, + { "es", 2, lb_prop_Spanish }, + { "fr", 2, lb_prop_French }, + { "ru", 2, lb_prop_Russian }, + { "zh", 2, lb_prop_Chinese }, + { NULL, 0, NULL } }; diff --git a/src/static_libs/libunibreak/linebreakdef.h b/src/static_libs/libunibreak/linebreakdef.h index 93fcd6781c..d557aba109 100644 --- a/src/static_libs/libunibreak/linebreakdef.h +++ b/src/static_libs/libunibreak/linebreakdef.h @@ -1,10 +1,11 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Line breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2008-2012 Wu Yongwei + * Copyright (C) 2008-2013 Wu Yongwei + * Copyright (C) 2013 Petr Filipsky * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -24,35 +25,36 @@ * distribution. * * The main reference is Unicode Standard Annex 14 (UAX #14): - * + * * * When this library was designed, this annex was at Revision 19, for * Unicode 5.0.0: - * + * * * This library has been updated according to Revision 30, for * Unicode 6.2.0: - * + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file linebreakdef.h + * @file linebreakdef.h * * Definitions of internal data structures, declarations of global * variables, and function prototypes for the line breaking algorithm. * - * @version 2.3, 2012/10/06 - * @author Wu Yongwei + * @version 2.4, 2013/11/10 + * @author Wu Yongwei + * @author Petr Filipsky */ /** * Constant value to mark the end of string. It is not a valid Unicode * character. */ -#define EOS 0xFFFF +#define EOS 0xFFFFFFFF /** * Line break classes. This is a direct mapping of Table 1 of Unicode @@ -60,52 +62,52 @@ */ enum LineBreakClass { - /* This is used to signal an error condition. */ - LBP_Undefined, /**< Undefined */ + /* This is used to signal an error condition. */ + LBP_Undefined, /**< Undefined */ - /* The following break classes are treated in the pair table. */ - LBP_OP, /**< Opening punctuation */ - LBP_CL, /**< Closing punctuation */ - LBP_CP, /**< Closing parenthesis */ - LBP_QU, /**< Ambiguous quotation */ - LBP_GL, /**< Glue */ - LBP_NS, /**< Non-starters */ - LBP_EX, /**< Exclamation/Interrogation */ - LBP_SY, /**< Symbols allowing break after */ - LBP_IS, /**< Infix separator */ - LBP_PR, /**< Prefix */ - LBP_PO, /**< Postfix */ - LBP_NU, /**< Numeric */ - LBP_AL, /**< Alphabetic */ - LBP_HL, /**< Hebrew letter */ - LBP_ID, /**< Ideographic */ - LBP_IN, /**< Inseparable characters */ - LBP_HY, /**< Hyphen */ - LBP_BA, /**< Break after */ - LBP_BB, /**< Break before */ - LBP_B2, /**< Break on either side (but not pair) */ - LBP_ZW, /**< Zero-width space */ - LBP_CM, /**< Combining marks */ - LBP_WJ, /**< Word joiner */ - LBP_H2, /**< Hangul LV */ - LBP_H3, /**< Hangul LVT */ - LBP_JL, /**< Hangul L Jamo */ - LBP_JV, /**< Hangul V Jamo */ - LBP_JT, /**< Hangul T Jamo */ - LBP_RI, /**< Regional indicator */ + /* The following break classes are treated in the pair table. */ + LBP_OP, /**< Opening punctuation */ + LBP_CL, /**< Closing punctuation */ + LBP_CP, /**< Closing parenthesis */ + LBP_QU, /**< Ambiguous quotation */ + LBP_GL, /**< Glue */ + LBP_NS, /**< Non-starters */ + LBP_EX, /**< Exclamation/Interrogation */ + LBP_SY, /**< Symbols allowing break after */ + LBP_IS, /**< Infix separator */ + LBP_PR, /**< Prefix */ + LBP_PO, /**< Postfix */ + LBP_NU, /**< Numeric */ + LBP_AL, /**< Alphabetic */ + LBP_HL, /**< Hebrew letter */ + LBP_ID, /**< Ideographic */ + LBP_IN, /**< Inseparable characters */ + LBP_HY, /**< Hyphen */ + LBP_BA, /**< Break after */ + LBP_BB, /**< Break before */ + LBP_B2, /**< Break on either side (but not pair) */ + LBP_ZW, /**< Zero-width space */ + LBP_CM, /**< Combining marks */ + LBP_WJ, /**< Word joiner */ + LBP_H2, /**< Hangul LV */ + LBP_H3, /**< Hangul LVT */ + LBP_JL, /**< Hangul L Jamo */ + LBP_JV, /**< Hangul V Jamo */ + LBP_JT, /**< Hangul T Jamo */ + LBP_RI, /**< Regional indicator */ - /* The following break classes are not treated in the pair table */ - LBP_AI, /**< Ambiguous (alphabetic or ideograph) */ - LBP_BK, /**< Break (mandatory) */ - LBP_CB, /**< Contingent break */ - LBP_CJ, /**< Conditional Japanese starter */ - LBP_CR, /**< Carriage return */ - LBP_LF, /**< Line feed */ - LBP_NL, /**< Next line */ - LBP_SA, /**< South-East Asian */ - LBP_SG, /**< Surrogates */ - LBP_SP, /**< Space */ - LBP_XX /**< Unknown */ + /* The following break classes are not treated in the pair table */ + LBP_AI, /**< Ambiguous (alphabetic or ideograph) */ + LBP_BK, /**< Break (mandatory) */ + LBP_CB, /**< Contingent break */ + LBP_CJ, /**< Conditional Japanese starter */ + LBP_CR, /**< Carriage return */ + LBP_LF, /**< Line feed */ + LBP_NL, /**< Next line */ + LBP_SA, /**< South-East Asian */ + LBP_SG, /**< Surrogates */ + LBP_SP, /**< Space */ + LBP_XX /**< Unknown */ }; /** @@ -114,9 +116,9 @@ enum LineBreakClass */ struct LineBreakProperties { - utf32_t start; /**< Starting coding point */ - utf32_t end; /**< End coding point */ - enum LineBreakClass prop; /**< The line breaking property */ + utf32_t start; /**< Starting coding point */ + utf32_t end; /**< End coding point */ + enum LineBreakClass prop; /**< The line breaking property */ }; /** @@ -125,9 +127,22 @@ struct LineBreakProperties */ struct LineBreakPropertiesLang { - const char *lang; /**< Language name */ - size_t namelen; /**< Length of name to match */ - struct LineBreakProperties *lbp; /**< Pointer to associated data */ + const char *lang; /**< Language name */ + size_t namelen; /**< Length of name to match */ + struct LineBreakProperties *lbp; /**< Pointer to associated data */ +}; + +/** + * Context representing internal state of the line breaking algorithm. + * This is useful to callers if incremental analysis is wanted. + */ +struct LineBreakContext +{ + const char *lang; /**< Language name */ + struct LineBreakProperties *lbpLang;/**< Pointer to LineBreakProperties */ + enum LineBreakClass lbcCur; /**< Breaking class of current codepoint */ + enum LineBreakClass lbcNew; /**< Breaking class of next codepoint */ + enum LineBreakClass lbcLast; /**< Breaking class of last codepoint */ }; /** @@ -144,9 +159,16 @@ extern struct LineBreakPropertiesLang lb_prop_lang_map[]; utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip); utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip); utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip); +void lb_init_break_context( + struct LineBreakContext* lbpCtx, + utf32_t ch, + const char* lang); +int lb_process_next_char( + struct LineBreakContext* lbpCtx, + utf32_t ch); void set_linebreaks( - const void *s, - size_t len, - const char *lang, - char *brks, - get_next_char_t get_next_char); + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char); diff --git a/src/static_libs/libunibreak/wordbreak.c b/src/static_libs/libunibreak/wordbreak.c index f2996c0e81..e67a1f8507 100644 --- a/src/static_libs/libunibreak/wordbreak.c +++ b/src/static_libs/libunibreak/wordbreak.c @@ -1,10 +1,10 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2012 Tom Hacohen + * Copyright (C) 2013 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -24,24 +24,28 @@ * distribution. * * The main reference is Unicode Standard Annex 29 (UAX #29): - * + * * * When this library was designed, this annex was at Revision 17, for * Unicode 6.0.0: - * + * + * + * This library has been updated according to Revision 21, for + * Unicode 6.2.0: + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file wordbreak.c + * @file wordbreak.c * * Implementation of the word breaking algorithm as described in Unicode * Standard Annex 29. * - * @version 2.3, 2013/05/14 - * @author Tom Hacohen + * @version 2.4, 2013/09/28 + * @author Tom Hacohen */ #include @@ -66,34 +70,34 @@ void init_wordbreak(void) /** * Gets the word breaking class of a character. * - * @param ch character to check - * @param wbp pointer to the wbp breaking properties array - * @param len size of the wbp array in number of items - * @return the word breaking class if found; \c WBP_Any otherwise + * @param ch character to check + * @param wbp pointer to the wbp breaking properties array + * @param len size of the wbp array in number of items + * @return the word breaking class if found; \c WBP_Any otherwise */ static enum WordBreakClass get_char_wb_class( - utf32_t ch, - struct WordBreakProperties *wbp, - size_t len) + utf32_t ch, + struct WordBreakProperties *wbp, + size_t len) { - int min = 0; - int max = len - 1; - int mid; + int min = 0; + int max = len - 1; + int mid; - do - { - mid = (min + max) / 2; + do + { + mid = (min + max) / 2; - if (ch < wbp[mid].start) - max = mid - 1; - else if (ch > wbp[mid].end) - min = mid + 1; - else - return wbp[mid].prop; - } - while (min <= max); + if (ch < wbp[mid].start) + max = mid - 1; + else if (ch > wbp[mid].end) + min = mid + 1; + else + return wbp[mid].prop; + } + while (min <= max); - return WBP_Any; + return WBP_Any; } /** @@ -103,346 +107,346 @@ static enum WordBreakClass get_char_wb_class( * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are * cells that we really don't want to break after. * - * @param[in] s input string - * @param[out] brks breaks array to fill - * @param[in] posStart start position - * @param[in] posEnd end position (exclusive) - * @param[in] len length of the string - * @param[in] brkType breaks type to use - * @param[in] get_next_char function to get the next UTF-32 character + * @param[in] s input string + * @param[out] brks breaks array to fill + * @param[in] posStart start position + * @param[in] posEnd end position (exclusive) + * @param[in] len length of the string + * @param[in] brkType breaks type to use + * @param[in] get_next_char function to get the next UTF-32 character */ static void set_brks_to( - const void *s, - char *brks, - size_t posStart, - size_t posEnd, - size_t len, - char brkType, - get_next_char_t get_next_char) + const void *s, + char *brks, + size_t posStart, + size_t posEnd, + size_t len, + char brkType, + get_next_char_t get_next_char) { - size_t posNext = posStart; - while (posNext < posEnd) - { - utf32_t ch; - ch = get_next_char(s, len, &posNext); - assert(ch != EOS); - for (; posStart < posNext - 1; ++posStart) - brks[posStart] = WORDBREAK_INSIDEACHAR; - assert(posStart == posNext - 1); + size_t posNext = posStart; + while (posNext < posEnd) + { + utf32_t ch; + ch = get_next_char(s, len, &posNext); + assert(ch != EOS); + for (; posStart < posNext - 1; ++posStart) + brks[posStart] = WORDBREAK_INSIDEACHAR; + assert(posStart == posNext - 1); - /* Only set it if we haven't set it not to break before. */ - if (brks[posStart] != WORDBREAK_NOBREAK) - brks[posStart] = brkType; - posStart = posNext; - } + /* Only set it if we haven't set it not to break before. */ + if (brks[posStart] != WORDBREAK_NOBREAK) + brks[posStart] = brkType; + posStart = posNext; + } } /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */ #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \ - (cls == WBP_LF)) + (cls == WBP_LF)) /** * Sets the word breaking information for a generic input string. * - * @param[in] s input string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or - * #WORDBREAK_INSIDEACHAR - * @param[in] get_next_char function to get the next UTF-32 character + * @param[in] s input string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR + * @param[in] get_next_char function to get the next UTF-32 character */ static void set_wordbreaks( - const void *s, - size_t len, - const char *lang, - char *brks, - get_next_char_t get_next_char) + const void *s, + size_t len, + const char *lang, + char *brks, + get_next_char_t get_next_char) { - enum WordBreakClass wbcLast = WBP_Undefined; - /* wbcSeqStart is the class that started the current sequence. - * WBP_Undefined is a special case that means "sot". - * This value is the class that is at the start of the current rule - * matching sequence. For example, in case of Numeric+MidNum+Numeric - * it'll be Numeric all the way. - */ - enum WordBreakClass wbcSeqStart = WBP_Undefined; - utf32_t ch; - size_t posNext = 0; - size_t posCur = 0; - size_t posLast = 0; + enum WordBreakClass wbcLast = WBP_Undefined; + /* wbcSeqStart is the class that started the current sequence. + * WBP_Undefined is a special case that means "sot". + * This value is the class that is at the start of the current rule + * matching sequence. For example, in case of Numeric+MidNum+Numeric + * it'll be Numeric all the way. + */ + enum WordBreakClass wbcSeqStart = WBP_Undefined; + utf32_t ch; + size_t posNext = 0; + size_t posCur = 0; + size_t posLast = 0; - /* TODO: Language-specific specialization. */ - (void) lang; + /* TODO: Language-specific specialization. */ + (void) lang; - /* Init brks. */ - memset(brks, WORDBREAK_BREAK, len); + /* Init brks. */ + memset(brks, WORDBREAK_BREAK, len); - ch = get_next_char(s, len, &posNext); + ch = get_next_char(s, len, &posNext); - while (ch != EOS) - { - enum WordBreakClass wbcCur; - wbcCur = get_char_wb_class(ch, wb_prop_default, - ARRAY_LEN(wb_prop_default)); + while (ch != EOS) + { + enum WordBreakClass wbcCur; + wbcCur = get_char_wb_class(ch, wb_prop_default, + ARRAY_LEN(wb_prop_default)); - switch (wbcCur) - { - case WBP_CR: - /* WB3b */ - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - break; + switch (wbcCur) + { + case WBP_CR: + /* WB3b */ + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_LF: - if (wbcSeqStart == WBP_CR) /* WB3 */ - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_NOBREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - break; - } - /* Fall off */ + case WBP_LF: + if (wbcSeqStart == WBP_CR) /* WB3 */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; + } + /* Fall off */ - case WBP_Newline: - /* WB3a,3b */ - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_Newline: + /* WB3a,3b */ + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_Extend: - case WBP_Format: - /* WB4 - If not the first char/after a newline (WB3a,3b), skip - * this class, set it to be the same as the prev, and mark - * brks not to break before them. */ - if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart)) - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - } - else - { - /* It's surely not the first */ - brks[posCur - 1] = WORDBREAK_NOBREAK; - /* "inherit" the previous class. */ - wbcCur = wbcLast; - } - break; + case WBP_Extend: + case WBP_Format: + /* WB4 - If not the first char/after a newline (WB3a,3b), skip + * this class, set it to be the same as the prev, and mark + * brks not to break before them. */ + if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart)) + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + } + else + { + /* It's surely not the first */ + brks[posCur - 1] = WORDBREAK_NOBREAK; + /* "inherit" the previous class. */ + wbcCur = wbcLast; + } + break; - case WBP_Katakana: - if ((wbcSeqStart == WBP_Katakana) || /* WB13 */ - (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_NOBREAK, get_next_char); - } - /* No rule found, reset */ - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - } - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_Katakana: + if ((wbcSeqStart == WBP_Katakana) || /* WB13 */ + (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_ALetter: - if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */ - (wbcLast == WBP_Numeric) || /* WB10 */ - (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_NOBREAK, get_next_char); - } - /* No rule found, reset */ - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - } - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_ALetter: + if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */ + (wbcLast == WBP_Numeric) || /* WB10 */ + (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_MidNumLet: - if ((wbcLast == WBP_ALetter) || /* WB6,7 */ - (wbcLast == WBP_Numeric)) /* WB11,12 */ - { - /* Go on */ - } - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - } - break; + case WBP_MidNumLet: + if ((wbcLast == WBP_ALetter) || /* WB6,7 */ + (wbcLast == WBP_Numeric)) /* WB11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; - case WBP_MidLetter: - if (wbcLast == WBP_ALetter) /* WB6,7 */ - { - /* Go on */ - } - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - } - break; + case WBP_MidLetter: + if (wbcLast == WBP_ALetter) /* WB6,7 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; - case WBP_MidNum: - if (wbcLast == WBP_Numeric) /* WB11,12 */ - { - /* Go on */ - } - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - } - break; + case WBP_MidNum: + if (wbcLast == WBP_Numeric) /* WB11,12 */ + { + /* Go on */ + } + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + } + break; - case WBP_Numeric: - if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */ - (wbcLast == WBP_ALetter) || /* WB9 */ - (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_NOBREAK, get_next_char); - } - /* No rule found, reset */ - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - } - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_Numeric: + if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */ + (wbcLast == WBP_ALetter) || /* WB9 */ + (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_ExtendNumLet: - /* WB13a,13b */ - if ((wbcSeqStart == wbcLast) && - ((wbcLast == WBP_ALetter) || - (wbcLast == WBP_Numeric) || - (wbcLast == WBP_Katakana) || - (wbcLast == WBP_ExtendNumLet))) - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_NOBREAK, get_next_char); - } - /* No rule found, reset */ - else - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - } - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_ExtendNumLet: + /* WB13a,13b */ + if ((wbcSeqStart == wbcLast) && + ((wbcLast == WBP_ALetter) || + (wbcLast == WBP_Numeric) || + (wbcLast == WBP_Katakana) || + (wbcLast == WBP_ExtendNumLet))) + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + /* No rule found, reset */ + else + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_Regional: - /* WB13c */ - if (wbcSeqStart == WBP_Regional) - { - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_NOBREAK, get_next_char); - } - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_Regional: + /* WB13c */ + if (wbcSeqStart == WBP_Regional) + { + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_NOBREAK, get_next_char); + } + wbcSeqStart = wbcCur; + posLast = posCur; + break; - case WBP_Any: - /* Allow breaks and reset */ - set_brks_to(s, brks, posLast, posCur, len, - WORDBREAK_BREAK, get_next_char); - wbcSeqStart = wbcCur; - posLast = posCur; - break; + case WBP_Any: + /* Allow breaks and reset */ + set_brks_to(s, brks, posLast, posCur, len, + WORDBREAK_BREAK, get_next_char); + wbcSeqStart = wbcCur; + posLast = posCur; + break; - default: - /* Error, should never get here! */ - assert(0); - break; - } + default: + /* Error, should never get here! */ + assert(0); + break; + } - wbcLast = wbcCur; - posCur = posNext; - ch = get_next_char(s, len, &posNext); + wbcLast = wbcCur; + posCur = posNext; + ch = get_next_char(s, len, &posNext); } - /* WB2 */ - set_brks_to(s, brks, posLast, posNext, len, - WORDBREAK_BREAK, get_next_char); + /* WB2 */ + set_brks_to(s, brks, posLast, posNext, len, + WORDBREAK_BREAK, get_next_char); } /** * Sets the word breaking information for a UTF-8 input string. * - * @param[in] s input UTF-8 string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or - * #WORDBREAK_INSIDEACHAR + * @param[in] s input UTF-8 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf8( - const utf8_t *s, - size_t len, - const char *lang, - char *brks) + const utf8_t *s, + size_t len, + const char *lang, + char *brks) { - set_wordbreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf8); + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf8); } /** * Sets the word breaking information for a UTF-16 input string. * - * @param[in] s input UTF-16 string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or - * #WORDBREAK_INSIDEACHAR + * @param[in] s input UTF-16 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf16( - const utf16_t *s, - size_t len, - const char *lang, - char *brks) + const utf16_t *s, + size_t len, + const char *lang, + char *brks) { - set_wordbreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf16); + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf16); } /** * Sets the word breaking information for a UTF-32 input string. * - * @param[in] s input UTF-32 string - * @param[in] len length of the input - * @param[in] lang language of the input - * @param[out] brks pointer to the output breaking data, containing - * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or - * #WORDBREAK_INSIDEACHAR + * @param[in] s input UTF-32 string + * @param[in] len length of the input + * @param[in] lang language of the input + * @param[out] brks pointer to the output breaking data, containing + * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or + * #WORDBREAK_INSIDEACHAR */ void set_wordbreaks_utf32( - const utf32_t *s, - size_t len, - const char *lang, - char *brks) + const utf32_t *s, + size_t len, + const char *lang, + char *brks) { - set_wordbreaks(s, len, lang, brks, - (get_next_char_t)lb_get_next_char_utf32); + set_wordbreaks(s, len, lang, brks, + (get_next_char_t)lb_get_next_char_utf32); } diff --git a/src/static_libs/libunibreak/wordbreak.h b/src/static_libs/libunibreak/wordbreak.h index 47bef274ce..cd2bf2c451 100644 --- a/src/static_libs/libunibreak/wordbreak.h +++ b/src/static_libs/libunibreak/wordbreak.h @@ -1,10 +1,10 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2012 Tom Hacohen + * Copyright (C) 2013 Tom Hacohen * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -24,23 +24,27 @@ * distribution. * * The main reference is Unicode Standard Annex 29 (UAX #29): - * + * * * When this library was designed, this annex was at Revision 17, for * Unicode 6.0.0: - * + * + * + * This library has been updated according to Revision 21, for + * Unicode 6.2.0: + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file wordbreak.h + * @file wordbreak.h * * Header file for the word breaking (segmentation) algorithm. * - * @version 2.2, 2012/02/04 - * @author Tom Hacohen + * @version 2.3, 2013/09/28 + * @author Tom Hacohen */ #ifndef WORDBREAK_H @@ -53,17 +57,17 @@ extern "C" { #endif -#define WORDBREAK_BREAK 0 /**< Break is allowed */ -#define WORDBREAK_NOBREAK 1 /**< No break is allowed */ -#define WORDBREAK_INSIDEACHAR 2 /**< A UTF-8/16 sequence is unfinished */ +#define WORDBREAK_BREAK 0 /**< Break is allowed */ +#define WORDBREAK_NOBREAK 1 /**< No break is allowed */ +#define WORDBREAK_INSIDEACHAR 2 /**< A UTF-8/16 sequence is unfinished */ void init_wordbreak(void); void set_wordbreaks_utf8( - const utf8_t *s, size_t len, const char* lang, char *brks); + const utf8_t *s, size_t len, const char* lang, char *brks); void set_wordbreaks_utf16( - const utf16_t *s, size_t len, const char* lang, char *brks); + const utf16_t *s, size_t len, const char* lang, char *brks); void set_wordbreaks_utf32( - const utf32_t *s, size_t len, const char* lang, char *brks); + const utf32_t *s, size_t len, const char* lang, char *brks); #ifdef __cplusplus } diff --git a/src/static_libs/libunibreak/wordbreakdef.h b/src/static_libs/libunibreak/wordbreakdef.h index ca8b1b62c0..72816f9dc4 100644 --- a/src/static_libs/libunibreak/wordbreakdef.h +++ b/src/static_libs/libunibreak/wordbreakdef.h @@ -1,10 +1,11 @@ -/* vim: set tabstop=4 shiftwidth=4: */ +/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Word breaking in a Unicode sequence. Designed to be used in a * generic text renderer. * - * Copyright (C) 2012 Tom Hacohen + * Copyright (C) 2013 Tom Hacohen + * Copyright (C) 2013 Petr Filipsky * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages @@ -24,47 +25,55 @@ * distribution. * * The main reference is Unicode Standard Annex 29 (UAX #29): - * + * * * When this library was designed, this annex was at Revision 17, for * Unicode 6.0.0: - * + * + * + * This library has been updated according to Revision 21, for + * Unicode 6.2.0: + * * * The Unicode Terms of Use are available at - * + * */ /** - * @file wordbreakdef.h + * @file wordbreakdef.h * * Definitions of internal data structures, declarations of global * variables, and function prototypes for the word breaking algorithm. * - * @version 2.2, 2013/05/14 - * @author Tom Hacohen + * @version 2.4, 2013/11/10 + * @author Tom Hacohen + * @author Petr Filipsky */ /** * Word break classes. This is a direct mapping of Table 3 of Unicode - * Standard Annex 29, Revision 17. + * Standard Annex 29, Revision 23. */ enum WordBreakClass { - WBP_Undefined, - WBP_CR, - WBP_LF, - WBP_Newline, - WBP_Extend, - WBP_Format, - WBP_Katakana, - WBP_ALetter, - WBP_MidNumLet, - WBP_MidLetter, - WBP_MidNum, - WBP_Numeric, - WBP_ExtendNumLet, - WBP_Regional, - WBP_Any + WBP_Undefined, + WBP_CR, + WBP_LF, + WBP_Newline, + WBP_Extend, + WBP_Format, + WBP_Katakana, + WBP_ALetter, + WBP_MidNumLet, + WBP_MidLetter, + WBP_MidNum, + WBP_Numeric, + WBP_ExtendNumLet, + WBP_Regional, + WBP_Hebrew, + WBP_Single, + WBP_Double, + WBP_Any }; /** @@ -73,7 +82,7 @@ enum WordBreakClass */ struct WordBreakProperties { - utf32_t start; /**< Starting coding point */ - utf32_t end; /**< End coding point */ - enum WordBreakClass prop; /**< The word breaking property */ + utf32_t start; /**< Starting coding point */ + utf32_t end; /**< End coding point */ + enum WordBreakClass prop; /**< The word breaking property */ };