forked from enlightenment/efl
Synced libunibreak local copy with upstream.
This fixes T805.
This commit is contained in:
parent
cc8fa1da45
commit
cff1a9a59f
|
@ -1,4 +1,5 @@
|
||||||
Wu Yongwei. Designed and implemented liblinebreak.
|
Wu Yongwei. Designed and implemented the original liblinebreak.
|
||||||
|
Current maintainer of libunibreak.
|
||||||
|
|
||||||
Nikolay Pultsin. Put forward the original requirements on liblinebreak,
|
Nikolay Pultsin. Put forward the original requirements on liblinebreak,
|
||||||
performed tests, and made a lot of suggestions on the initial versions.
|
performed tests, and made a lot of suggestions on the initial versions.
|
||||||
|
@ -6,3 +7,5 @@ performed tests, and made a lot of suggestions on the initial versions.
|
||||||
Thomas Klausner. Autoconfiscated and libtoolized liblinebreak.
|
Thomas Klausner. Autoconfiscated and libtoolized liblinebreak.
|
||||||
|
|
||||||
Tom Hacohen. Added word boundaries support.
|
Tom Hacohen. Added word boundaries support.
|
||||||
|
|
||||||
|
Petr Filipsky. Added incremental processing for line-breaking.
|
||||||
|
|
|
@ -1,3 +1,116 @@
|
||||||
|
2013-11-14 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/linebreak.c: Add/update comments and doc comments.
|
||||||
|
(lb_init_breaking_class): Rename to treat_first_char.
|
||||||
|
(lb_classify_break_simple): Rename to get_lb_result_simple.
|
||||||
|
(lb_classify_break_lookup): Rename to get_lb_result_lookup.
|
||||||
|
(set_linebreaks): Remove an unused local variable.
|
||||||
|
|
||||||
|
2013-11-14 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/linebreakdata.c: Regenerate from LineBreak-6.3.0.txt.
|
||||||
|
|
||||||
|
2013-11-13 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Fix compilation problems under MSVC.
|
||||||
|
* src/linebreak.c (lb_init_breaking_class): Remove `inline'.
|
||||||
|
(lb_classify_break_simple): Ditto.
|
||||||
|
(lb_classify_break_lookup): Ditto.
|
||||||
|
(lb_classify_break_lookup): Move local variable declaration before
|
||||||
|
assertions.
|
||||||
|
|
||||||
|
2013-11-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/Makefile.am (libunibreak_la_LDFLAGS): Set the version-info to
|
||||||
|
`2:0:1'.
|
||||||
|
|
||||||
|
2013-11-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/linebreakdef.c: Adjust the order of code.
|
||||||
|
(lb_process_next_char): Make its return type int.
|
||||||
|
* src/linebreak.c (lb_process_next_char): Ditto.
|
||||||
|
|
||||||
|
2013-11-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/linebreak.c: Make minor changes in doc comments, formatting,
|
||||||
|
and names.
|
||||||
|
* src/linebreakdef.c: Ditto.
|
||||||
|
|
||||||
|
2013-11-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* AUTHORS: Add `Petr Filipsky'.
|
||||||
|
|
||||||
|
2013-11-10 Petr Filipsky <philodej@gmail.com>
|
||||||
|
|
||||||
|
Expose low level line-breaking API for incremental processing.
|
||||||
|
* src/linebreak.h: Add prototype declarations for
|
||||||
|
lb_init_break_context and lb_process_next_char.
|
||||||
|
(struct LineBreakContext): New struct.
|
||||||
|
* src/linebreak.h (LINEBREAK_UNDEFINED): New macro constant.
|
||||||
|
(lb_init_breaking_class): New static function.
|
||||||
|
(lb_classify_break_simple): New static function.
|
||||||
|
(lb_classify_break_lookup): New static function.
|
||||||
|
(lb_init_break_context): New function.
|
||||||
|
(lb_process_next_char): New function.
|
||||||
|
(set_linebreaks): Implement with lb_init_break_context and
|
||||||
|
lb_process_next_char.
|
||||||
|
|
||||||
|
2013-11-05 Petr Filipsky <philodej@gmail.com>
|
||||||
|
|
||||||
|
* src/wordbreakdef.h (enum WordBreakClass): Update according to
|
||||||
|
Table 3 of Unicode Standard Annex 29, Revision 23.
|
||||||
|
|
||||||
|
2013-09-30 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Update for the libunibreak 1.1 release.
|
||||||
|
* configure.ac (AC_INIT): Change the library version to `1.1'.
|
||||||
|
* Doxyfile (PROJECT_NUMBER): Change to `1.1'.
|
||||||
|
* Makefile.am (EXTRA_DIST): Add the `tools' directory.
|
||||||
|
* NEWS: Add information about libunibreak 1.1.
|
||||||
|
* src/Makefile.am (libunibreak_la_LDFLAGS): Set the version to `1:1'.
|
||||||
|
|
||||||
|
2013-09-29 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/Makefile.msvc: Modernize obsolete/deprecated MSVC options.
|
||||||
|
|
||||||
|
2013-09-28 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/wordbreak.c: Update copyright year and UAX information.
|
||||||
|
* src/wordbreak.h: Ditto.
|
||||||
|
* src/wordbreakdef.h: Ditto.
|
||||||
|
|
||||||
|
2013-09-28 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Fix the errors caused by libtool 2.4 (really annoying to the level
|
||||||
|
of WTF for making me add the foolish dependency on m4).
|
||||||
|
* Makefile.am (ACLOCAL_AMFLAGS): Add `-I m4'.
|
||||||
|
* bootstrap: Add a line to execute autoreconf.
|
||||||
|
* configure.ac (AC_CONFIG_MACRO_DIR): Set to `[m4]'.
|
||||||
|
* purge: Make it remove also the m4 directory.
|
||||||
|
|
||||||
|
2013-09-28 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile.am (EXTRA_DIST): Add `README.md'.
|
||||||
|
|
||||||
|
2013-09-28 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* README.md: New Markdown version of README.
|
||||||
|
* README: Remove.
|
||||||
|
|
||||||
|
2013-05-13 Tom Hacohen <tom@stosb.com>
|
||||||
|
|
||||||
|
Update files according to UAX #29-21, for Unicode 6.2.0.
|
||||||
|
* README: Update the reference to UAX #29-21.
|
||||||
|
* src/wordbreak.c (set_wordbreaks): Update for WBP_Regional.
|
||||||
|
* src/wordbreakdef.h (WBP_Regional): New enumerator for the new
|
||||||
|
property `RI' as defined in UAX #29-21.
|
||||||
|
* src/wordbreakdata.c: Regenerate from WordBreakProperty-6.2.0.txt.
|
||||||
|
|
||||||
|
2013-05-06 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* src/Makefile.am (install-exec-hook): Make sure `--disable-static'
|
||||||
|
can work (thanks to Eugene V. Lyubimkin).
|
||||||
|
|
||||||
2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
|
2012-10-06 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
Update files according to UAX #14-30, for Unicode 6.2.0.
|
Update files according to UAX #14-30, for Unicode 6.2.0.
|
||||||
|
@ -82,11 +195,12 @@
|
||||||
|
|
||||||
2012-08-11 Wu Yongwei <wuyongwei@gmail.com>
|
2012-08-11 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Update for the libunibreak 1.0 release.
|
||||||
* configure.ac (AC_INIT): Change the library name and version to
|
* configure.ac (AC_INIT): Change the library name and version to
|
||||||
`libunibreak' and `1.0'.
|
`libunibreak' and `1.0'.
|
||||||
(AC_PROG_LN_S): New macro.
|
(AC_PROG_LN_S): New macro.
|
||||||
(AC_OUTPUT): Change to `libunibreak.pc'.
|
(AC_OUTPUT): Change to `libunibreak.pc'.
|
||||||
* Doxyfile: (PROJECT_NAME): Change to `libunibreak'.
|
* Doxyfile (PROJECT_NAME): Change to `libunibreak'.
|
||||||
(PROJECT_NUMBER): Change to `1.0'.
|
(PROJECT_NUMBER): Change to `1.0'.
|
||||||
* LICENCE: Add copyright information about Tom Hacohen.
|
* LICENCE: Add copyright information about Tom Hacohen.
|
||||||
* Makefile.am (lib_LTLIBRARIES): Change to `libunibreak.la'.
|
* Makefile.am (lib_LTLIBRARIES): Change to `libunibreak.la'.
|
||||||
|
@ -96,7 +210,7 @@
|
||||||
a symlink to libunibreak.a.
|
a symlink to libunibreak.a.
|
||||||
* Makefile.msvc: Change the library name to `libunibreak', and the
|
* Makefile.msvc: Change the library name to `libunibreak', and the
|
||||||
output library to `unibreak.lib'.
|
output library to `unibreak.lib'.
|
||||||
* NEW: Add information about libunibreak 1.0.
|
* NEWS: Add information about libunibreak 1.0.
|
||||||
* README: Change the library name, and add information about word
|
* README: Change the library name, and add information about word
|
||||||
break.
|
break.
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,10 @@
|
||||||
|
New in libunibreak 1.1
|
||||||
|
|
||||||
|
- Update the code and data to conform to Unicode 6.2.0
|
||||||
|
- Update build files to support libtool 2.4
|
||||||
|
- Adjust code structure
|
||||||
|
- Make a few bug fixes
|
||||||
|
|
||||||
New in libunibreak 1.0
|
New in libunibreak 1.0
|
||||||
|
|
||||||
- Add word breaking support
|
- Add word breaking support
|
||||||
|
|
|
@ -1,31 +1,30 @@
|
||||||
L I B U N I B R E A K
|
LIBUNIBREAK
|
||||||
=====================
|
===========
|
||||||
|
|
||||||
Overview
|
Overview
|
||||||
--------
|
--------
|
||||||
|
|
||||||
This is the README file for libunibreak, an implementation of the line
|
This is the README file for libunibreak, an implementation of the line
|
||||||
breaking and word breaking algorithms as described in Unicode
|
breaking and word breaking algorithms as described in [Unicode Standard
|
||||||
Standard Annex 14 and Unicode Standard Annex 30, available at
|
Annex 14] [1] and [Unicode Standard Annex 29] [2]. Check the project's
|
||||||
<URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
[home page] [3] for up-to-date information.
|
||||||
<URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
|
||||||
|
|
||||||
Check this URL for up-to-date information:
|
[1]: http://www.unicode.org/reports/tr14/tr14-30.html
|
||||||
<URL:https://github.com/adah1972/libunibreak>
|
[2]: http://www.unicode.org/reports/tr29/tr29-21.html
|
||||||
|
[3]: https://github.com/adah1972/libunibreak
|
||||||
|
|
||||||
|
|
||||||
Licence
|
Licence
|
||||||
-------
|
-------
|
||||||
|
|
||||||
This library is released under an open-source licence, the zlib/libpng
|
This library is released under an open-source licence, the zlib/libpng
|
||||||
licence. Please check the file LICENCE for details.
|
licence. Please check the file *LICENCE* for details.
|
||||||
|
|
||||||
Apart from using the algorithm, part of the code is derived from the
|
Apart from using the algorithm, part of the code is derived from the
|
||||||
data provided under
|
[Unicode Public Data] [4], and the [Unicode Terms of Use] [5] may apply.
|
||||||
<URL:http://www.unicode.org/Public/>
|
|
||||||
|
|
||||||
And the Unicode Terms of Use may apply:
|
[4]: http://www.unicode.org/Public/
|
||||||
<URL:http://www.unicode.org/copyright.html>
|
[5]: http://www.unicode.org/copyright.html
|
||||||
|
|
||||||
|
|
||||||
Installation
|
Installation
|
||||||
|
@ -33,56 +32,56 @@ Installation
|
||||||
|
|
||||||
There are three ways to build the library:
|
There are three ways to build the library:
|
||||||
|
|
||||||
1) On *NIX systems supported by the autoconfiscation tools, do the
|
1. On \*NIX systems supported by the autoconfiscation tools, do the
|
||||||
normal
|
normal
|
||||||
|
|
||||||
./configure
|
./configure
|
||||||
make
|
make
|
||||||
sudo make install
|
sudo make install
|
||||||
|
|
||||||
to build and install both the dynamic and static libraries. In
|
to build and install both the dynamic and static libraries. In
|
||||||
addition, one may
|
addition, one may
|
||||||
|
- type `make doc` to generate the doxygen documentation; or
|
||||||
|
- type `make linebreakdata` to regenerate *linebreakdata.c* from
|
||||||
|
*LineBreak.txt*.
|
||||||
|
- type `make wordbreakdata` to regenerate *wordbreakdata.c* from
|
||||||
|
*WordBreakProperty.txt*.
|
||||||
|
|
||||||
- type `make doc' to generate the doxygen documentation; or
|
2. On systems where GCC and Binutils are supported, one can type
|
||||||
- type `make linebreakdata' to regenerate linebreakdata.c from
|
|
||||||
LineBreak.txt.
|
|
||||||
- type `make wordbreakdata' to regenerate wordbreakdata.c from
|
|
||||||
WordBreakProperty.txt.
|
|
||||||
|
|
||||||
2) On systems where GCC and Binutils are supported, one can type
|
cd src
|
||||||
|
cp -p Makefile.gcc Makefile
|
||||||
cd src
|
make
|
||||||
cp -p Makefile.gcc Makefile
|
|
||||||
make
|
|
||||||
|
|
||||||
to build the static library. In addition, one may
|
to build the static library. In addition, one may
|
||||||
|
- type `make debug` or `make release` to explicitly generate the
|
||||||
- type `make debug' or `make release' to explicitly generate the
|
|
||||||
debug or release build;
|
debug or release build;
|
||||||
- type `make doc' to generate the doxygen documentation; or
|
- type `make doc` to generate the doxygen documentation; or
|
||||||
- type `make linebreakdata' to regenerate linebreakdata.c from
|
- type `make linebreakdata` to regenerate *linebreakdata.c* from
|
||||||
LineBreak.txt.
|
*LineBreak.txt*.
|
||||||
- type `make wordbreakdata' to regenerate wordbreakdata.c from
|
- type `make wordbreakdata` to regenerate *wordbreakdata.c* from
|
||||||
WordBreakProperty.txt.
|
*WordBreakProperty.txt*.
|
||||||
|
|
||||||
3) On Windows, apart from using method 1 (Cygwin/MSYS) and method 2
|
3. On Windows, apart from using method 1 (Cygwin/MSYS) and method 2
|
||||||
(MinGW), MSVC can also be used. Type
|
(MinGW), MSVC can also be used. Type
|
||||||
|
|
||||||
cd src
|
cd src
|
||||||
nmake -f Makefile.msvc
|
nmake -f Makefile.msvc
|
||||||
|
|
||||||
to build the static library. By default the debug release is built.
|
to build the static library. By default the debug release is built.
|
||||||
To build the release version
|
To build the release version
|
||||||
|
|
||||||
nmake -f Makefile.msvc CFG="libunibreak - Win32 Release"
|
nmake -f Makefile.msvc CFG="libunibreak - Win32 Release"
|
||||||
|
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
Check the generated document doc/html/linebreak_8h.html and
|
Check the generated document *doc/html/linebreak\_8h.html* and
|
||||||
doc/html/wordbreak_8h.html in the downloaded file for the public
|
*doc/html/wordbreak\_8h.html* in the downloaded file for the public
|
||||||
interfaces exposed to applications.
|
interfaces exposed to applications.
|
||||||
|
|
||||||
|
|
||||||
|
<!--
|
||||||
vim:autoindent:expandtab:formatoptions=tcqlmn:textwidth=72:
|
vim:autoindent:expandtab:formatoptions=tcqlmn:textwidth=72:
|
||||||
|
-->
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,4 +1,4 @@
|
||||||
/* vim: set tabstop=4 shiftwidth=4: */
|
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Line breaking in a Unicode sequence. Designed to be used in a
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
|
@ -24,27 +24,27 @@
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
* <URL:http://www.unicode.org/reports/tr14/>
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
*
|
*
|
||||||
* When this library was designed, this annex was at Revision 19, for
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
* Unicode 5.0.0:
|
* Unicode 5.0.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
*
|
*
|
||||||
* This library has been updated according to Revision 30, for
|
* This library has been updated according to Revision 30, for
|
||||||
* Unicode 6.2.0:
|
* Unicode 6.2.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
* <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
||||||
*
|
*
|
||||||
* The Unicode Terms of Use are available at
|
* The Unicode Terms of Use are available at
|
||||||
* <URL:http://www.unicode.org/copyright.html>
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file linebreak.h
|
* @file linebreak.h
|
||||||
*
|
*
|
||||||
* Header file for the line breaking algorithm.
|
* Header file for the line breaking algorithm.
|
||||||
*
|
*
|
||||||
* @version 2.2, 2012/10/06
|
* @version 2.2, 2012/10/06
|
||||||
* @author Wu Yongwei
|
* @author Wu Yongwei
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef LINEBREAK_H
|
#ifndef LINEBREAK_H
|
||||||
|
@ -56,28 +56,28 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LINEBREAK_VERSION 0x0202 /**< Version of the library linebreak */
|
#define LINEBREAK_VERSION 0x0202 /**< Version of the library linebreak */
|
||||||
extern const int linebreak_version;
|
extern const int linebreak_version;
|
||||||
|
|
||||||
#ifndef LINEBREAK_UTF_TYPES_DEFINED
|
#ifndef LINEBREAK_UTF_TYPES_DEFINED
|
||||||
#define LINEBREAK_UTF_TYPES_DEFINED
|
#define LINEBREAK_UTF_TYPES_DEFINED
|
||||||
typedef unsigned char utf8_t; /**< Type for UTF-8 data points */
|
typedef unsigned char utf8_t; /**< Type for UTF-8 data points */
|
||||||
typedef unsigned short utf16_t; /**< Type for UTF-16 data points */
|
typedef unsigned short utf16_t; /**< Type for UTF-16 data points */
|
||||||
typedef unsigned int utf32_t; /**< Type for UTF-32 data points */
|
typedef unsigned int utf32_t; /**< Type for UTF-32 data points */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */
|
#define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */
|
||||||
#define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */
|
#define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */
|
||||||
#define LINEBREAK_NOBREAK 2 /**< No break is possible */
|
#define LINEBREAK_NOBREAK 2 /**< No break is possible */
|
||||||
#define LINEBREAK_INSIDEACHAR 3 /**< A UTF-8/16 sequence is unfinished */
|
#define LINEBREAK_INSIDEACHAR 3 /**< A UTF-8/16 sequence is unfinished */
|
||||||
|
|
||||||
void init_linebreak(void);
|
void init_linebreak(void);
|
||||||
void set_linebreaks_utf8(
|
void set_linebreaks_utf8(
|
||||||
const utf8_t *s, size_t len, const char* lang, char *brks);
|
const utf8_t *s, size_t len, const char* lang, char *brks);
|
||||||
void set_linebreaks_utf16(
|
void set_linebreaks_utf16(
|
||||||
const utf16_t *s, size_t len, const char* lang, char *brks);
|
const utf16_t *s, size_t len, const char* lang, char *brks);
|
||||||
void set_linebreaks_utf32(
|
void set_linebreaks_utf32(
|
||||||
const utf32_t *s, size_t len, const char* lang, char *brks);
|
const utf32_t *s, size_t len, const char* lang, char *brks);
|
||||||
int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang);
|
int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
/* The content of this file is generated from:
|
/* The content of this file is generated from:
|
||||||
# LineBreak-6.2.0.txt
|
# LineBreak-6.3.0.txt
|
||||||
# Date: 2012-08-08, 19:26:00 GMT [KW]
|
# Date: 2013-02-06, 19:45:00 GMT [KW, LI]
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "linebreak.h"
|
#include "linebreak.h"
|
||||||
|
@ -114,7 +114,9 @@ struct LineBreakProperties lb_prop_default[] = {
|
||||||
{ 0x060C, 0x060D, LBP_IS },
|
{ 0x060C, 0x060D, LBP_IS },
|
||||||
{ 0x060E, 0x060F, LBP_AL },
|
{ 0x060E, 0x060F, LBP_AL },
|
||||||
{ 0x0610, 0x061A, LBP_CM },
|
{ 0x0610, 0x061A, LBP_CM },
|
||||||
{ 0x061B, 0x061F, LBP_EX },
|
{ 0x061B, 0x061B, LBP_EX },
|
||||||
|
{ 0x061C, 0x061C, LBP_CM },
|
||||||
|
{ 0x061E, 0x061F, LBP_EX },
|
||||||
{ 0x0620, 0x064A, LBP_AL },
|
{ 0x0620, 0x064A, LBP_AL },
|
||||||
{ 0x064B, 0x065F, LBP_CM },
|
{ 0x064B, 0x065F, LBP_CM },
|
||||||
{ 0x0660, 0x0669, LBP_NU },
|
{ 0x0660, 0x0669, LBP_NU },
|
||||||
|
@ -456,7 +458,7 @@ struct LineBreakProperties lb_prop_default[] = {
|
||||||
{ 0x205D, 0x205F, LBP_BA },
|
{ 0x205D, 0x205F, LBP_BA },
|
||||||
{ 0x2060, 0x2060, LBP_WJ },
|
{ 0x2060, 0x2060, LBP_WJ },
|
||||||
{ 0x2061, 0x2064, LBP_AL },
|
{ 0x2061, 0x2064, LBP_AL },
|
||||||
{ 0x206A, 0x206F, LBP_CM },
|
{ 0x2066, 0x206F, LBP_CM },
|
||||||
{ 0x2070, 0x2071, LBP_AL },
|
{ 0x2070, 0x2071, LBP_AL },
|
||||||
{ 0x2074, 0x2074, LBP_AI },
|
{ 0x2074, 0x2074, LBP_AI },
|
||||||
{ 0x2075, 0x207C, LBP_AL },
|
{ 0x2075, 0x207C, LBP_AL },
|
||||||
|
@ -473,7 +475,7 @@ struct LineBreakProperties lb_prop_default[] = {
|
||||||
{ 0x20A7, 0x20A7, LBP_PO },
|
{ 0x20A7, 0x20A7, LBP_PO },
|
||||||
{ 0x20A8, 0x20B5, LBP_PR },
|
{ 0x20A8, 0x20B5, LBP_PR },
|
||||||
{ 0x20B6, 0x20B6, LBP_PO },
|
{ 0x20B6, 0x20B6, LBP_PO },
|
||||||
{ 0x20B7, 0x20BA, LBP_PR },
|
{ 0x20B7, 0x20CF, LBP_PR },
|
||||||
{ 0x20D0, 0x20F0, LBP_CM },
|
{ 0x20D0, 0x20F0, LBP_CM },
|
||||||
{ 0x2100, 0x2102, LBP_AL },
|
{ 0x2100, 0x2102, LBP_AL },
|
||||||
{ 0x2103, 0x2103, LBP_PO },
|
{ 0x2103, 0x2103, LBP_PO },
|
||||||
|
@ -774,7 +776,8 @@ struct LineBreakProperties lb_prop_default[] = {
|
||||||
{ 0x2E33, 0x2E34, LBP_BA },
|
{ 0x2E33, 0x2E34, LBP_BA },
|
||||||
{ 0x2E35, 0x2E39, LBP_AL },
|
{ 0x2E35, 0x2E39, LBP_AL },
|
||||||
{ 0x2E3A, 0x2E3B, LBP_B2 },
|
{ 0x2E3A, 0x2E3B, LBP_B2 },
|
||||||
{ 0x2E80, 0x3000, LBP_ID },
|
{ 0x2E80, 0x2FFB, LBP_ID },
|
||||||
|
{ 0x3000, 0x3000, LBP_BA },
|
||||||
{ 0x3001, 0x3002, LBP_CL },
|
{ 0x3001, 0x3002, LBP_CL },
|
||||||
{ 0x3003, 0x3004, LBP_ID },
|
{ 0x3003, 0x3004, LBP_ID },
|
||||||
{ 0x3005, 0x3005, LBP_NS },
|
{ 0x3005, 0x3005, LBP_NS },
|
||||||
|
@ -803,7 +806,9 @@ struct LineBreakProperties lb_prop_default[] = {
|
||||||
{ 0x301E, 0x301F, LBP_CL },
|
{ 0x301E, 0x301F, LBP_CL },
|
||||||
{ 0x3020, 0x3029, LBP_ID },
|
{ 0x3020, 0x3029, LBP_ID },
|
||||||
{ 0x302A, 0x302F, LBP_CM },
|
{ 0x302A, 0x302F, LBP_CM },
|
||||||
{ 0x3030, 0x303A, LBP_ID },
|
{ 0x3030, 0x3034, LBP_ID },
|
||||||
|
{ 0x3035, 0x3035, LBP_CM },
|
||||||
|
{ 0x3036, 0x303A, LBP_ID },
|
||||||
{ 0x303B, 0x303C, LBP_NS },
|
{ 0x303B, 0x303C, LBP_NS },
|
||||||
{ 0x303D, 0x303F, LBP_ID },
|
{ 0x303D, 0x303F, LBP_ID },
|
||||||
{ 0x3041, 0x3041, LBP_CJ },
|
{ 0x3041, 0x3041, LBP_CJ },
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* vim: set tabstop=4 shiftwidth=4: */
|
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Line breaking in a Unicode sequence. Designed to be used in a
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
|
@ -24,27 +24,27 @@
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
* <URL:http://www.unicode.org/reports/tr14/>
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
*
|
*
|
||||||
* When this library was designed, this annex was at Revision 19, for
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
* Unicode 5.0.0:
|
* Unicode 5.0.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
*
|
*
|
||||||
* This library has been updated according to Revision 30, for
|
* This library has been updated according to Revision 30, for
|
||||||
* Unicode 6.2.0:
|
* Unicode 6.2.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
* <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
||||||
*
|
*
|
||||||
* The Unicode Terms of Use are available at
|
* The Unicode Terms of Use are available at
|
||||||
* <URL:http://www.unicode.org/copyright.html>
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file linebreakdef.c
|
* @file linebreakdef.c
|
||||||
*
|
*
|
||||||
* Definition of language-specific data.
|
* Definition of language-specific data.
|
||||||
*
|
*
|
||||||
* @version 2.2, 2012/10/06
|
* @version 2.2, 2012/10/06
|
||||||
* @author Wu Yongwei
|
* @author Wu Yongwei
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "linebreak.h"
|
#include "linebreak.h"
|
||||||
|
@ -54,72 +54,72 @@
|
||||||
* English-specific data over the default Unicode rules.
|
* English-specific data over the default Unicode rules.
|
||||||
*/
|
*/
|
||||||
static struct LineBreakProperties lb_prop_English[] = {
|
static struct LineBreakProperties lb_prop_English[] = {
|
||||||
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
{ 0, 0, LBP_Undefined }
|
{ 0, 0, LBP_Undefined }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* German-specific data over the default Unicode rules.
|
* German-specific data over the default Unicode rules.
|
||||||
*/
|
*/
|
||||||
static struct LineBreakProperties lb_prop_German[] = {
|
static struct LineBreakProperties lb_prop_German[] = {
|
||||||
{ 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */
|
{ 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */
|
||||||
{ 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */
|
{ 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */
|
||||||
{ 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */
|
{ 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */
|
||||||
{ 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
|
{ 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
|
||||||
{ 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */
|
{ 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */
|
||||||
{ 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */
|
{ 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */
|
||||||
{ 0, 0, LBP_Undefined }
|
{ 0, 0, LBP_Undefined }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Spanish-specific data over the default Unicode rules.
|
* Spanish-specific data over the default Unicode rules.
|
||||||
*/
|
*/
|
||||||
static struct LineBreakProperties lb_prop_Spanish[] = {
|
static struct LineBreakProperties lb_prop_Spanish[] = {
|
||||||
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
||||||
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
||||||
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
{ 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
|
{ 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
|
||||||
{ 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
|
{ 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
|
||||||
{ 0, 0, LBP_Undefined }
|
{ 0, 0, LBP_Undefined }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* French-specific data over the default Unicode rules.
|
* French-specific data over the default Unicode rules.
|
||||||
*/
|
*/
|
||||||
static struct LineBreakProperties lb_prop_French[] = {
|
static struct LineBreakProperties lb_prop_French[] = {
|
||||||
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
||||||
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
||||||
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
{ 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
|
{ 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
|
||||||
{ 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
|
{ 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
|
||||||
{ 0, 0, LBP_Undefined }
|
{ 0, 0, LBP_Undefined }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Russian-specific data over the default Unicode rules.
|
* Russian-specific data over the default Unicode rules.
|
||||||
*/
|
*/
|
||||||
static struct LineBreakProperties lb_prop_Russian[] = {
|
static struct LineBreakProperties lb_prop_Russian[] = {
|
||||||
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
||||||
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
||||||
{ 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
|
{ 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
|
||||||
{ 0, 0, LBP_Undefined }
|
{ 0, 0, LBP_Undefined }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Chinese-specific data over the default Unicode rules.
|
* Chinese-specific data over the default Unicode rules.
|
||||||
*/
|
*/
|
||||||
static struct LineBreakProperties lb_prop_Chinese[] = {
|
static struct LineBreakProperties lb_prop_Chinese[] = {
|
||||||
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
{ 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */
|
{ 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */
|
||||||
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
{ 0, 0, LBP_Undefined }
|
{ 0, 0, LBP_Undefined }
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -129,11 +129,11 @@ static struct LineBreakProperties lb_prop_Chinese[] = {
|
||||||
* you may want to redefine \e lb_prop_lang_map in your C source file.
|
* you may want to redefine \e lb_prop_lang_map in your C source file.
|
||||||
*/
|
*/
|
||||||
struct LineBreakPropertiesLang lb_prop_lang_map[] = {
|
struct LineBreakPropertiesLang lb_prop_lang_map[] = {
|
||||||
{ "en", 2, lb_prop_English },
|
{ "en", 2, lb_prop_English },
|
||||||
{ "de", 2, lb_prop_German },
|
{ "de", 2, lb_prop_German },
|
||||||
{ "es", 2, lb_prop_Spanish },
|
{ "es", 2, lb_prop_Spanish },
|
||||||
{ "fr", 2, lb_prop_French },
|
{ "fr", 2, lb_prop_French },
|
||||||
{ "ru", 2, lb_prop_Russian },
|
{ "ru", 2, lb_prop_Russian },
|
||||||
{ "zh", 2, lb_prop_Chinese },
|
{ "zh", 2, lb_prop_Chinese },
|
||||||
{ NULL, 0, NULL }
|
{ NULL, 0, NULL }
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
/* vim: set tabstop=4 shiftwidth=4: */
|
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Line breaking in a Unicode sequence. Designed to be used in a
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
* generic text renderer.
|
* generic text renderer.
|
||||||
*
|
*
|
||||||
* Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
|
* Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
|
||||||
|
* Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
|
||||||
*
|
*
|
||||||
* This software is provided 'as-is', without any express or implied
|
* This software is provided 'as-is', without any express or implied
|
||||||
* warranty. In no event will the author be held liable for any damages
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
@ -24,35 +25,36 @@
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
* <URL:http://www.unicode.org/reports/tr14/>
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
*
|
*
|
||||||
* When this library was designed, this annex was at Revision 19, for
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
* Unicode 5.0.0:
|
* Unicode 5.0.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
*
|
*
|
||||||
* This library has been updated according to Revision 30, for
|
* This library has been updated according to Revision 30, for
|
||||||
* Unicode 6.2.0:
|
* Unicode 6.2.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
* <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
|
||||||
*
|
*
|
||||||
* The Unicode Terms of Use are available at
|
* The Unicode Terms of Use are available at
|
||||||
* <URL:http://www.unicode.org/copyright.html>
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file linebreakdef.h
|
* @file linebreakdef.h
|
||||||
*
|
*
|
||||||
* Definitions of internal data structures, declarations of global
|
* Definitions of internal data structures, declarations of global
|
||||||
* variables, and function prototypes for the line breaking algorithm.
|
* variables, and function prototypes for the line breaking algorithm.
|
||||||
*
|
*
|
||||||
* @version 2.3, 2012/10/06
|
* @version 2.4, 2013/11/10
|
||||||
* @author Wu Yongwei
|
* @author Wu Yongwei
|
||||||
|
* @author Petr Filipsky
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constant value to mark the end of string. It is not a valid Unicode
|
* Constant value to mark the end of string. It is not a valid Unicode
|
||||||
* character.
|
* character.
|
||||||
*/
|
*/
|
||||||
#define EOS 0xFFFF
|
#define EOS 0xFFFFFFFF
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Line break classes. This is a direct mapping of Table 1 of Unicode
|
* Line break classes. This is a direct mapping of Table 1 of Unicode
|
||||||
|
@ -60,52 +62,52 @@
|
||||||
*/
|
*/
|
||||||
enum LineBreakClass
|
enum LineBreakClass
|
||||||
{
|
{
|
||||||
/* This is used to signal an error condition. */
|
/* This is used to signal an error condition. */
|
||||||
LBP_Undefined, /**< Undefined */
|
LBP_Undefined, /**< Undefined */
|
||||||
|
|
||||||
/* The following break classes are treated in the pair table. */
|
/* The following break classes are treated in the pair table. */
|
||||||
LBP_OP, /**< Opening punctuation */
|
LBP_OP, /**< Opening punctuation */
|
||||||
LBP_CL, /**< Closing punctuation */
|
LBP_CL, /**< Closing punctuation */
|
||||||
LBP_CP, /**< Closing parenthesis */
|
LBP_CP, /**< Closing parenthesis */
|
||||||
LBP_QU, /**< Ambiguous quotation */
|
LBP_QU, /**< Ambiguous quotation */
|
||||||
LBP_GL, /**< Glue */
|
LBP_GL, /**< Glue */
|
||||||
LBP_NS, /**< Non-starters */
|
LBP_NS, /**< Non-starters */
|
||||||
LBP_EX, /**< Exclamation/Interrogation */
|
LBP_EX, /**< Exclamation/Interrogation */
|
||||||
LBP_SY, /**< Symbols allowing break after */
|
LBP_SY, /**< Symbols allowing break after */
|
||||||
LBP_IS, /**< Infix separator */
|
LBP_IS, /**< Infix separator */
|
||||||
LBP_PR, /**< Prefix */
|
LBP_PR, /**< Prefix */
|
||||||
LBP_PO, /**< Postfix */
|
LBP_PO, /**< Postfix */
|
||||||
LBP_NU, /**< Numeric */
|
LBP_NU, /**< Numeric */
|
||||||
LBP_AL, /**< Alphabetic */
|
LBP_AL, /**< Alphabetic */
|
||||||
LBP_HL, /**< Hebrew letter */
|
LBP_HL, /**< Hebrew letter */
|
||||||
LBP_ID, /**< Ideographic */
|
LBP_ID, /**< Ideographic */
|
||||||
LBP_IN, /**< Inseparable characters */
|
LBP_IN, /**< Inseparable characters */
|
||||||
LBP_HY, /**< Hyphen */
|
LBP_HY, /**< Hyphen */
|
||||||
LBP_BA, /**< Break after */
|
LBP_BA, /**< Break after */
|
||||||
LBP_BB, /**< Break before */
|
LBP_BB, /**< Break before */
|
||||||
LBP_B2, /**< Break on either side (but not pair) */
|
LBP_B2, /**< Break on either side (but not pair) */
|
||||||
LBP_ZW, /**< Zero-width space */
|
LBP_ZW, /**< Zero-width space */
|
||||||
LBP_CM, /**< Combining marks */
|
LBP_CM, /**< Combining marks */
|
||||||
LBP_WJ, /**< Word joiner */
|
LBP_WJ, /**< Word joiner */
|
||||||
LBP_H2, /**< Hangul LV */
|
LBP_H2, /**< Hangul LV */
|
||||||
LBP_H3, /**< Hangul LVT */
|
LBP_H3, /**< Hangul LVT */
|
||||||
LBP_JL, /**< Hangul L Jamo */
|
LBP_JL, /**< Hangul L Jamo */
|
||||||
LBP_JV, /**< Hangul V Jamo */
|
LBP_JV, /**< Hangul V Jamo */
|
||||||
LBP_JT, /**< Hangul T Jamo */
|
LBP_JT, /**< Hangul T Jamo */
|
||||||
LBP_RI, /**< Regional indicator */
|
LBP_RI, /**< Regional indicator */
|
||||||
|
|
||||||
/* The following break classes are not treated in the pair table */
|
/* The following break classes are not treated in the pair table */
|
||||||
LBP_AI, /**< Ambiguous (alphabetic or ideograph) */
|
LBP_AI, /**< Ambiguous (alphabetic or ideograph) */
|
||||||
LBP_BK, /**< Break (mandatory) */
|
LBP_BK, /**< Break (mandatory) */
|
||||||
LBP_CB, /**< Contingent break */
|
LBP_CB, /**< Contingent break */
|
||||||
LBP_CJ, /**< Conditional Japanese starter */
|
LBP_CJ, /**< Conditional Japanese starter */
|
||||||
LBP_CR, /**< Carriage return */
|
LBP_CR, /**< Carriage return */
|
||||||
LBP_LF, /**< Line feed */
|
LBP_LF, /**< Line feed */
|
||||||
LBP_NL, /**< Next line */
|
LBP_NL, /**< Next line */
|
||||||
LBP_SA, /**< South-East Asian */
|
LBP_SA, /**< South-East Asian */
|
||||||
LBP_SG, /**< Surrogates */
|
LBP_SG, /**< Surrogates */
|
||||||
LBP_SP, /**< Space */
|
LBP_SP, /**< Space */
|
||||||
LBP_XX /**< Unknown */
|
LBP_XX /**< Unknown */
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -114,9 +116,9 @@ enum LineBreakClass
|
||||||
*/
|
*/
|
||||||
struct LineBreakProperties
|
struct LineBreakProperties
|
||||||
{
|
{
|
||||||
utf32_t start; /**< Starting coding point */
|
utf32_t start; /**< Starting coding point */
|
||||||
utf32_t end; /**< End coding point */
|
utf32_t end; /**< End coding point */
|
||||||
enum LineBreakClass prop; /**< The line breaking property */
|
enum LineBreakClass prop; /**< The line breaking property */
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -125,9 +127,22 @@ struct LineBreakProperties
|
||||||
*/
|
*/
|
||||||
struct LineBreakPropertiesLang
|
struct LineBreakPropertiesLang
|
||||||
{
|
{
|
||||||
const char *lang; /**< Language name */
|
const char *lang; /**< Language name */
|
||||||
size_t namelen; /**< Length of name to match */
|
size_t namelen; /**< Length of name to match */
|
||||||
struct LineBreakProperties *lbp; /**< Pointer to associated data */
|
struct LineBreakProperties *lbp; /**< Pointer to associated data */
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Context representing internal state of the line breaking algorithm.
|
||||||
|
* This is useful to callers if incremental analysis is wanted.
|
||||||
|
*/
|
||||||
|
struct LineBreakContext
|
||||||
|
{
|
||||||
|
const char *lang; /**< Language name */
|
||||||
|
struct LineBreakProperties *lbpLang;/**< Pointer to LineBreakProperties */
|
||||||
|
enum LineBreakClass lbcCur; /**< Breaking class of current codepoint */
|
||||||
|
enum LineBreakClass lbcNew; /**< Breaking class of next codepoint */
|
||||||
|
enum LineBreakClass lbcLast; /**< Breaking class of last codepoint */
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -144,9 +159,16 @@ extern struct LineBreakPropertiesLang lb_prop_lang_map[];
|
||||||
utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip);
|
utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip);
|
||||||
utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip);
|
utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip);
|
||||||
utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip);
|
utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip);
|
||||||
|
void lb_init_break_context(
|
||||||
|
struct LineBreakContext* lbpCtx,
|
||||||
|
utf32_t ch,
|
||||||
|
const char* lang);
|
||||||
|
int lb_process_next_char(
|
||||||
|
struct LineBreakContext* lbpCtx,
|
||||||
|
utf32_t ch);
|
||||||
void set_linebreaks(
|
void set_linebreaks(
|
||||||
const void *s,
|
const void *s,
|
||||||
size_t len,
|
size_t len,
|
||||||
const char *lang,
|
const char *lang,
|
||||||
char *brks,
|
char *brks,
|
||||||
get_next_char_t get_next_char);
|
get_next_char_t get_next_char);
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
/* vim: set tabstop=4 shiftwidth=4: */
|
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Word breaking in a Unicode sequence. Designed to be used in a
|
* Word breaking in a Unicode sequence. Designed to be used in a
|
||||||
* generic text renderer.
|
* generic text renderer.
|
||||||
*
|
*
|
||||||
* Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
|
* Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
|
||||||
*
|
*
|
||||||
* This software is provided 'as-is', without any express or implied
|
* This software is provided 'as-is', without any express or implied
|
||||||
* warranty. In no event will the author be held liable for any damages
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
@ -24,24 +24,28 @@
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* The main reference is Unicode Standard Annex 29 (UAX #29):
|
* The main reference is Unicode Standard Annex 29 (UAX #29):
|
||||||
* <URL:http://unicode.org/reports/tr29>
|
* <URL:http://unicode.org/reports/tr29>
|
||||||
*
|
*
|
||||||
* When this library was designed, this annex was at Revision 17, for
|
* When this library was designed, this annex was at Revision 17, for
|
||||||
* Unicode 6.0.0:
|
* Unicode 6.0.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 21, for
|
||||||
|
* Unicode 6.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
|
||||||
*
|
*
|
||||||
* The Unicode Terms of Use are available at
|
* The Unicode Terms of Use are available at
|
||||||
* <URL:http://www.unicode.org/copyright.html>
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file wordbreak.c
|
* @file wordbreak.c
|
||||||
*
|
*
|
||||||
* Implementation of the word breaking algorithm as described in Unicode
|
* Implementation of the word breaking algorithm as described in Unicode
|
||||||
* Standard Annex 29.
|
* Standard Annex 29.
|
||||||
*
|
*
|
||||||
* @version 2.3, 2013/05/14
|
* @version 2.4, 2013/09/28
|
||||||
* @author Tom Hacohen
|
* @author Tom Hacohen
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
@ -66,34 +70,34 @@ void init_wordbreak(void)
|
||||||
/**
|
/**
|
||||||
* Gets the word breaking class of a character.
|
* Gets the word breaking class of a character.
|
||||||
*
|
*
|
||||||
* @param ch character to check
|
* @param ch character to check
|
||||||
* @param wbp pointer to the wbp breaking properties array
|
* @param wbp pointer to the wbp breaking properties array
|
||||||
* @param len size of the wbp array in number of items
|
* @param len size of the wbp array in number of items
|
||||||
* @return the word breaking class if found; \c WBP_Any otherwise
|
* @return the word breaking class if found; \c WBP_Any otherwise
|
||||||
*/
|
*/
|
||||||
static enum WordBreakClass get_char_wb_class(
|
static enum WordBreakClass get_char_wb_class(
|
||||||
utf32_t ch,
|
utf32_t ch,
|
||||||
struct WordBreakProperties *wbp,
|
struct WordBreakProperties *wbp,
|
||||||
size_t len)
|
size_t len)
|
||||||
{
|
{
|
||||||
int min = 0;
|
int min = 0;
|
||||||
int max = len - 1;
|
int max = len - 1;
|
||||||
int mid;
|
int mid;
|
||||||
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
mid = (min + max) / 2;
|
mid = (min + max) / 2;
|
||||||
|
|
||||||
if (ch < wbp[mid].start)
|
if (ch < wbp[mid].start)
|
||||||
max = mid - 1;
|
max = mid - 1;
|
||||||
else if (ch > wbp[mid].end)
|
else if (ch > wbp[mid].end)
|
||||||
min = mid + 1;
|
min = mid + 1;
|
||||||
else
|
else
|
||||||
return wbp[mid].prop;
|
return wbp[mid].prop;
|
||||||
}
|
}
|
||||||
while (min <= max);
|
while (min <= max);
|
||||||
|
|
||||||
return WBP_Any;
|
return WBP_Any;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -103,346 +107,346 @@ static enum WordBreakClass get_char_wb_class(
|
||||||
* Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
|
* Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
|
||||||
* cells that we really don't want to break after.
|
* cells that we really don't want to break after.
|
||||||
*
|
*
|
||||||
* @param[in] s input string
|
* @param[in] s input string
|
||||||
* @param[out] brks breaks array to fill
|
* @param[out] brks breaks array to fill
|
||||||
* @param[in] posStart start position
|
* @param[in] posStart start position
|
||||||
* @param[in] posEnd end position (exclusive)
|
* @param[in] posEnd end position (exclusive)
|
||||||
* @param[in] len length of the string
|
* @param[in] len length of the string
|
||||||
* @param[in] brkType breaks type to use
|
* @param[in] brkType breaks type to use
|
||||||
* @param[in] get_next_char function to get the next UTF-32 character
|
* @param[in] get_next_char function to get the next UTF-32 character
|
||||||
*/
|
*/
|
||||||
static void set_brks_to(
|
static void set_brks_to(
|
||||||
const void *s,
|
const void *s,
|
||||||
char *brks,
|
char *brks,
|
||||||
size_t posStart,
|
size_t posStart,
|
||||||
size_t posEnd,
|
size_t posEnd,
|
||||||
size_t len,
|
size_t len,
|
||||||
char brkType,
|
char brkType,
|
||||||
get_next_char_t get_next_char)
|
get_next_char_t get_next_char)
|
||||||
{
|
{
|
||||||
size_t posNext = posStart;
|
size_t posNext = posStart;
|
||||||
while (posNext < posEnd)
|
while (posNext < posEnd)
|
||||||
{
|
{
|
||||||
utf32_t ch;
|
utf32_t ch;
|
||||||
ch = get_next_char(s, len, &posNext);
|
ch = get_next_char(s, len, &posNext);
|
||||||
assert(ch != EOS);
|
assert(ch != EOS);
|
||||||
for (; posStart < posNext - 1; ++posStart)
|
for (; posStart < posNext - 1; ++posStart)
|
||||||
brks[posStart] = WORDBREAK_INSIDEACHAR;
|
brks[posStart] = WORDBREAK_INSIDEACHAR;
|
||||||
assert(posStart == posNext - 1);
|
assert(posStart == posNext - 1);
|
||||||
|
|
||||||
/* Only set it if we haven't set it not to break before. */
|
/* Only set it if we haven't set it not to break before. */
|
||||||
if (brks[posStart] != WORDBREAK_NOBREAK)
|
if (brks[posStart] != WORDBREAK_NOBREAK)
|
||||||
brks[posStart] = brkType;
|
brks[posStart] = brkType;
|
||||||
posStart = posNext;
|
posStart = posNext;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
|
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
|
||||||
#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
|
#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
|
||||||
(cls == WBP_LF))
|
(cls == WBP_LF))
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the word breaking information for a generic input string.
|
* Sets the word breaking information for a generic input string.
|
||||||
*
|
*
|
||||||
* @param[in] s input string
|
* @param[in] s input string
|
||||||
* @param[in] len length of the input
|
* @param[in] len length of the input
|
||||||
* @param[in] lang language of the input
|
* @param[in] lang language of the input
|
||||||
* @param[out] brks pointer to the output breaking data, containing
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
||||||
* #WORDBREAK_INSIDEACHAR
|
* #WORDBREAK_INSIDEACHAR
|
||||||
* @param[in] get_next_char function to get the next UTF-32 character
|
* @param[in] get_next_char function to get the next UTF-32 character
|
||||||
*/
|
*/
|
||||||
static void set_wordbreaks(
|
static void set_wordbreaks(
|
||||||
const void *s,
|
const void *s,
|
||||||
size_t len,
|
size_t len,
|
||||||
const char *lang,
|
const char *lang,
|
||||||
char *brks,
|
char *brks,
|
||||||
get_next_char_t get_next_char)
|
get_next_char_t get_next_char)
|
||||||
{
|
{
|
||||||
enum WordBreakClass wbcLast = WBP_Undefined;
|
enum WordBreakClass wbcLast = WBP_Undefined;
|
||||||
/* wbcSeqStart is the class that started the current sequence.
|
/* wbcSeqStart is the class that started the current sequence.
|
||||||
* WBP_Undefined is a special case that means "sot".
|
* WBP_Undefined is a special case that means "sot".
|
||||||
* This value is the class that is at the start of the current rule
|
* This value is the class that is at the start of the current rule
|
||||||
* matching sequence. For example, in case of Numeric+MidNum+Numeric
|
* matching sequence. For example, in case of Numeric+MidNum+Numeric
|
||||||
* it'll be Numeric all the way.
|
* it'll be Numeric all the way.
|
||||||
*/
|
*/
|
||||||
enum WordBreakClass wbcSeqStart = WBP_Undefined;
|
enum WordBreakClass wbcSeqStart = WBP_Undefined;
|
||||||
utf32_t ch;
|
utf32_t ch;
|
||||||
size_t posNext = 0;
|
size_t posNext = 0;
|
||||||
size_t posCur = 0;
|
size_t posCur = 0;
|
||||||
size_t posLast = 0;
|
size_t posLast = 0;
|
||||||
|
|
||||||
/* TODO: Language-specific specialization. */
|
/* TODO: Language-specific specialization. */
|
||||||
(void) lang;
|
(void) lang;
|
||||||
|
|
||||||
/* Init brks. */
|
/* Init brks. */
|
||||||
memset(brks, WORDBREAK_BREAK, len);
|
memset(brks, WORDBREAK_BREAK, len);
|
||||||
|
|
||||||
ch = get_next_char(s, len, &posNext);
|
ch = get_next_char(s, len, &posNext);
|
||||||
|
|
||||||
while (ch != EOS)
|
while (ch != EOS)
|
||||||
{
|
{
|
||||||
enum WordBreakClass wbcCur;
|
enum WordBreakClass wbcCur;
|
||||||
wbcCur = get_char_wb_class(ch, wb_prop_default,
|
wbcCur = get_char_wb_class(ch, wb_prop_default,
|
||||||
ARRAY_LEN(wb_prop_default));
|
ARRAY_LEN(wb_prop_default));
|
||||||
|
|
||||||
switch (wbcCur)
|
switch (wbcCur)
|
||||||
{
|
{
|
||||||
case WBP_CR:
|
case WBP_CR:
|
||||||
/* WB3b */
|
/* WB3b */
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_LF:
|
case WBP_LF:
|
||||||
if (wbcSeqStart == WBP_CR) /* WB3 */
|
if (wbcSeqStart == WBP_CR) /* WB3 */
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_NOBREAK, get_next_char);
|
WORDBREAK_NOBREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* Fall off */
|
/* Fall off */
|
||||||
|
|
||||||
case WBP_Newline:
|
case WBP_Newline:
|
||||||
/* WB3a,3b */
|
/* WB3a,3b */
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_Extend:
|
case WBP_Extend:
|
||||||
case WBP_Format:
|
case WBP_Format:
|
||||||
/* WB4 - If not the first char/after a newline (WB3a,3b), skip
|
/* WB4 - If not the first char/after a newline (WB3a,3b), skip
|
||||||
* this class, set it to be the same as the prev, and mark
|
* this class, set it to be the same as the prev, and mark
|
||||||
* brks not to break before them. */
|
* brks not to break before them. */
|
||||||
if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
|
if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
/* It's surely not the first */
|
/* It's surely not the first */
|
||||||
brks[posCur - 1] = WORDBREAK_NOBREAK;
|
brks[posCur - 1] = WORDBREAK_NOBREAK;
|
||||||
/* "inherit" the previous class. */
|
/* "inherit" the previous class. */
|
||||||
wbcCur = wbcLast;
|
wbcCur = wbcLast;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_Katakana:
|
case WBP_Katakana:
|
||||||
if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
|
if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
|
||||||
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_NOBREAK, get_next_char);
|
WORDBREAK_NOBREAK, get_next_char);
|
||||||
}
|
}
|
||||||
/* No rule found, reset */
|
/* No rule found, reset */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
}
|
}
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_ALetter:
|
case WBP_ALetter:
|
||||||
if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
|
if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
|
||||||
(wbcLast == WBP_Numeric) || /* WB10 */
|
(wbcLast == WBP_Numeric) || /* WB10 */
|
||||||
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_NOBREAK, get_next_char);
|
WORDBREAK_NOBREAK, get_next_char);
|
||||||
}
|
}
|
||||||
/* No rule found, reset */
|
/* No rule found, reset */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
}
|
}
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_MidNumLet:
|
case WBP_MidNumLet:
|
||||||
if ((wbcLast == WBP_ALetter) || /* WB6,7 */
|
if ((wbcLast == WBP_ALetter) || /* WB6,7 */
|
||||||
(wbcLast == WBP_Numeric)) /* WB11,12 */
|
(wbcLast == WBP_Numeric)) /* WB11,12 */
|
||||||
{
|
{
|
||||||
/* Go on */
|
/* Go on */
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_MidLetter:
|
case WBP_MidLetter:
|
||||||
if (wbcLast == WBP_ALetter) /* WB6,7 */
|
if (wbcLast == WBP_ALetter) /* WB6,7 */
|
||||||
{
|
{
|
||||||
/* Go on */
|
/* Go on */
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_MidNum:
|
case WBP_MidNum:
|
||||||
if (wbcLast == WBP_Numeric) /* WB11,12 */
|
if (wbcLast == WBP_Numeric) /* WB11,12 */
|
||||||
{
|
{
|
||||||
/* Go on */
|
/* Go on */
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_Numeric:
|
case WBP_Numeric:
|
||||||
if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
|
if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
|
||||||
(wbcLast == WBP_ALetter) || /* WB9 */
|
(wbcLast == WBP_ALetter) || /* WB9 */
|
||||||
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
(wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_NOBREAK, get_next_char);
|
WORDBREAK_NOBREAK, get_next_char);
|
||||||
}
|
}
|
||||||
/* No rule found, reset */
|
/* No rule found, reset */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
}
|
}
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_ExtendNumLet:
|
case WBP_ExtendNumLet:
|
||||||
/* WB13a,13b */
|
/* WB13a,13b */
|
||||||
if ((wbcSeqStart == wbcLast) &&
|
if ((wbcSeqStart == wbcLast) &&
|
||||||
((wbcLast == WBP_ALetter) ||
|
((wbcLast == WBP_ALetter) ||
|
||||||
(wbcLast == WBP_Numeric) ||
|
(wbcLast == WBP_Numeric) ||
|
||||||
(wbcLast == WBP_Katakana) ||
|
(wbcLast == WBP_Katakana) ||
|
||||||
(wbcLast == WBP_ExtendNumLet)))
|
(wbcLast == WBP_ExtendNumLet)))
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_NOBREAK, get_next_char);
|
WORDBREAK_NOBREAK, get_next_char);
|
||||||
}
|
}
|
||||||
/* No rule found, reset */
|
/* No rule found, reset */
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
}
|
}
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_Regional:
|
case WBP_Regional:
|
||||||
/* WB13c */
|
/* WB13c */
|
||||||
if (wbcSeqStart == WBP_Regional)
|
if (wbcSeqStart == WBP_Regional)
|
||||||
{
|
{
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_NOBREAK, get_next_char);
|
WORDBREAK_NOBREAK, get_next_char);
|
||||||
}
|
}
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case WBP_Any:
|
case WBP_Any:
|
||||||
/* Allow breaks and reset */
|
/* Allow breaks and reset */
|
||||||
set_brks_to(s, brks, posLast, posCur, len,
|
set_brks_to(s, brks, posLast, posCur, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
wbcSeqStart = wbcCur;
|
wbcSeqStart = wbcCur;
|
||||||
posLast = posCur;
|
posLast = posCur;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* Error, should never get here! */
|
/* Error, should never get here! */
|
||||||
assert(0);
|
assert(0);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
wbcLast = wbcCur;
|
wbcLast = wbcCur;
|
||||||
posCur = posNext;
|
posCur = posNext;
|
||||||
ch = get_next_char(s, len, &posNext);
|
ch = get_next_char(s, len, &posNext);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* WB2 */
|
/* WB2 */
|
||||||
set_brks_to(s, brks, posLast, posNext, len,
|
set_brks_to(s, brks, posLast, posNext, len,
|
||||||
WORDBREAK_BREAK, get_next_char);
|
WORDBREAK_BREAK, get_next_char);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the word breaking information for a UTF-8 input string.
|
* Sets the word breaking information for a UTF-8 input string.
|
||||||
*
|
*
|
||||||
* @param[in] s input UTF-8 string
|
* @param[in] s input UTF-8 string
|
||||||
* @param[in] len length of the input
|
* @param[in] len length of the input
|
||||||
* @param[in] lang language of the input
|
* @param[in] lang language of the input
|
||||||
* @param[out] brks pointer to the output breaking data, containing
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
||||||
* #WORDBREAK_INSIDEACHAR
|
* #WORDBREAK_INSIDEACHAR
|
||||||
*/
|
*/
|
||||||
void set_wordbreaks_utf8(
|
void set_wordbreaks_utf8(
|
||||||
const utf8_t *s,
|
const utf8_t *s,
|
||||||
size_t len,
|
size_t len,
|
||||||
const char *lang,
|
const char *lang,
|
||||||
char *brks)
|
char *brks)
|
||||||
{
|
{
|
||||||
set_wordbreaks(s, len, lang, brks,
|
set_wordbreaks(s, len, lang, brks,
|
||||||
(get_next_char_t)lb_get_next_char_utf8);
|
(get_next_char_t)lb_get_next_char_utf8);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the word breaking information for a UTF-16 input string.
|
* Sets the word breaking information for a UTF-16 input string.
|
||||||
*
|
*
|
||||||
* @param[in] s input UTF-16 string
|
* @param[in] s input UTF-16 string
|
||||||
* @param[in] len length of the input
|
* @param[in] len length of the input
|
||||||
* @param[in] lang language of the input
|
* @param[in] lang language of the input
|
||||||
* @param[out] brks pointer to the output breaking data, containing
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
||||||
* #WORDBREAK_INSIDEACHAR
|
* #WORDBREAK_INSIDEACHAR
|
||||||
*/
|
*/
|
||||||
void set_wordbreaks_utf16(
|
void set_wordbreaks_utf16(
|
||||||
const utf16_t *s,
|
const utf16_t *s,
|
||||||
size_t len,
|
size_t len,
|
||||||
const char *lang,
|
const char *lang,
|
||||||
char *brks)
|
char *brks)
|
||||||
{
|
{
|
||||||
set_wordbreaks(s, len, lang, brks,
|
set_wordbreaks(s, len, lang, brks,
|
||||||
(get_next_char_t)lb_get_next_char_utf16);
|
(get_next_char_t)lb_get_next_char_utf16);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the word breaking information for a UTF-32 input string.
|
* Sets the word breaking information for a UTF-32 input string.
|
||||||
*
|
*
|
||||||
* @param[in] s input UTF-32 string
|
* @param[in] s input UTF-32 string
|
||||||
* @param[in] len length of the input
|
* @param[in] len length of the input
|
||||||
* @param[in] lang language of the input
|
* @param[in] lang language of the input
|
||||||
* @param[out] brks pointer to the output breaking data, containing
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
* #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
|
||||||
* #WORDBREAK_INSIDEACHAR
|
* #WORDBREAK_INSIDEACHAR
|
||||||
*/
|
*/
|
||||||
void set_wordbreaks_utf32(
|
void set_wordbreaks_utf32(
|
||||||
const utf32_t *s,
|
const utf32_t *s,
|
||||||
size_t len,
|
size_t len,
|
||||||
const char *lang,
|
const char *lang,
|
||||||
char *brks)
|
char *brks)
|
||||||
{
|
{
|
||||||
set_wordbreaks(s, len, lang, brks,
|
set_wordbreaks(s, len, lang, brks,
|
||||||
(get_next_char_t)lb_get_next_char_utf32);
|
(get_next_char_t)lb_get_next_char_utf32);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
/* vim: set tabstop=4 shiftwidth=4: */
|
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Word breaking in a Unicode sequence. Designed to be used in a
|
* Word breaking in a Unicode sequence. Designed to be used in a
|
||||||
* generic text renderer.
|
* generic text renderer.
|
||||||
*
|
*
|
||||||
* Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
|
* Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
|
||||||
*
|
*
|
||||||
* This software is provided 'as-is', without any express or implied
|
* This software is provided 'as-is', without any express or implied
|
||||||
* warranty. In no event will the author be held liable for any damages
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
@ -24,23 +24,27 @@
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* The main reference is Unicode Standard Annex 29 (UAX #29):
|
* The main reference is Unicode Standard Annex 29 (UAX #29):
|
||||||
* <URL:http://unicode.org/reports/tr29>
|
* <URL:http://unicode.org/reports/tr29>
|
||||||
*
|
*
|
||||||
* When this library was designed, this annex was at Revision 17, for
|
* When this library was designed, this annex was at Revision 17, for
|
||||||
* Unicode 6.0.0:
|
* Unicode 6.0.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 21, for
|
||||||
|
* Unicode 6.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
|
||||||
*
|
*
|
||||||
* The Unicode Terms of Use are available at
|
* The Unicode Terms of Use are available at
|
||||||
* <URL:http://www.unicode.org/copyright.html>
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file wordbreak.h
|
* @file wordbreak.h
|
||||||
*
|
*
|
||||||
* Header file for the word breaking (segmentation) algorithm.
|
* Header file for the word breaking (segmentation) algorithm.
|
||||||
*
|
*
|
||||||
* @version 2.2, 2012/02/04
|
* @version 2.3, 2013/09/28
|
||||||
* @author Tom Hacohen
|
* @author Tom Hacohen
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef WORDBREAK_H
|
#ifndef WORDBREAK_H
|
||||||
|
@ -53,17 +57,17 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define WORDBREAK_BREAK 0 /**< Break is allowed */
|
#define WORDBREAK_BREAK 0 /**< Break is allowed */
|
||||||
#define WORDBREAK_NOBREAK 1 /**< No break is allowed */
|
#define WORDBREAK_NOBREAK 1 /**< No break is allowed */
|
||||||
#define WORDBREAK_INSIDEACHAR 2 /**< A UTF-8/16 sequence is unfinished */
|
#define WORDBREAK_INSIDEACHAR 2 /**< A UTF-8/16 sequence is unfinished */
|
||||||
|
|
||||||
void init_wordbreak(void);
|
void init_wordbreak(void);
|
||||||
void set_wordbreaks_utf8(
|
void set_wordbreaks_utf8(
|
||||||
const utf8_t *s, size_t len, const char* lang, char *brks);
|
const utf8_t *s, size_t len, const char* lang, char *brks);
|
||||||
void set_wordbreaks_utf16(
|
void set_wordbreaks_utf16(
|
||||||
const utf16_t *s, size_t len, const char* lang, char *brks);
|
const utf16_t *s, size_t len, const char* lang, char *brks);
|
||||||
void set_wordbreaks_utf32(
|
void set_wordbreaks_utf32(
|
||||||
const utf32_t *s, size_t len, const char* lang, char *brks);
|
const utf32_t *s, size_t len, const char* lang, char *brks);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
/* vim: set tabstop=4 shiftwidth=4: */
|
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Word breaking in a Unicode sequence. Designed to be used in a
|
* Word breaking in a Unicode sequence. Designed to be used in a
|
||||||
* generic text renderer.
|
* generic text renderer.
|
||||||
*
|
*
|
||||||
* Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
|
* Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
|
||||||
|
* Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
|
||||||
*
|
*
|
||||||
* This software is provided 'as-is', without any express or implied
|
* This software is provided 'as-is', without any express or implied
|
||||||
* warranty. In no event will the author be held liable for any damages
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
@ -24,47 +25,55 @@
|
||||||
* distribution.
|
* distribution.
|
||||||
*
|
*
|
||||||
* The main reference is Unicode Standard Annex 29 (UAX #29):
|
* The main reference is Unicode Standard Annex 29 (UAX #29):
|
||||||
* <URL:http://unicode.org/reports/tr29>
|
* <URL:http://unicode.org/reports/tr29>
|
||||||
*
|
*
|
||||||
* When this library was designed, this annex was at Revision 17, for
|
* When this library was designed, this annex was at Revision 17, for
|
||||||
* Unicode 6.0.0:
|
* Unicode 6.0.0:
|
||||||
* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
* <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 21, for
|
||||||
|
* Unicode 6.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
|
||||||
*
|
*
|
||||||
* The Unicode Terms of Use are available at
|
* The Unicode Terms of Use are available at
|
||||||
* <URL:http://www.unicode.org/copyright.html>
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file wordbreakdef.h
|
* @file wordbreakdef.h
|
||||||
*
|
*
|
||||||
* Definitions of internal data structures, declarations of global
|
* Definitions of internal data structures, declarations of global
|
||||||
* variables, and function prototypes for the word breaking algorithm.
|
* variables, and function prototypes for the word breaking algorithm.
|
||||||
*
|
*
|
||||||
* @version 2.2, 2013/05/14
|
* @version 2.4, 2013/11/10
|
||||||
* @author Tom Hacohen
|
* @author Tom Hacohen
|
||||||
|
* @author Petr Filipsky
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Word break classes. This is a direct mapping of Table 3 of Unicode
|
* Word break classes. This is a direct mapping of Table 3 of Unicode
|
||||||
* Standard Annex 29, Revision 17.
|
* Standard Annex 29, Revision 23.
* NOTE(review): the file header comment cites Revision 21 -- confirm
* which revision this enum actually follows.
|
||||||
*/
|
*/
|
||||||
enum WordBreakClass
|
enum WordBreakClass
|
||||||
{
|
{
|
||||||
WBP_Undefined,
|
WBP_Undefined,
|
||||||
WBP_CR,
|
WBP_CR,
|
||||||
WBP_LF,
|
WBP_LF,
|
||||||
WBP_Newline,
|
WBP_Newline,
|
||||||
WBP_Extend,
|
WBP_Extend,
|
||||||
WBP_Format,
|
WBP_Format,
|
||||||
WBP_Katakana,
|
WBP_Katakana,
|
||||||
WBP_ALetter,
|
WBP_ALetter,
|
||||||
WBP_MidNumLet,
|
WBP_MidNumLet,
|
||||||
WBP_MidLetter,
|
WBP_MidLetter,
|
||||||
WBP_MidNum,
|
WBP_MidNum,
|
||||||
WBP_Numeric,
|
WBP_Numeric,
|
||||||
WBP_ExtendNumLet,
|
WBP_ExtendNumLet,
|
||||||
WBP_Regional,
|
WBP_Regional,
|
||||||
WBP_Any
|
WBP_Hebrew, /* NOTE(review): presumably Hebrew_Letter of UAX #29 -- confirm */
|
||||||
|
WBP_Single, /* NOTE(review): presumably Single_Quote of UAX #29 -- confirm */
|
||||||
|
WBP_Double, /* NOTE(review): presumably Double_Quote of UAX #29 -- confirm */
|
||||||
|
WBP_Any
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -73,7 +82,7 @@ enum WordBreakClass
|
||||||
*/
|
*/
|
||||||
struct WordBreakProperties
|
struct WordBreakProperties
|
||||||
{
|
{
|
||||||
utf32_t start; /**< Starting code point */
|
utf32_t start; /**< Starting code point */
|
||||||
utf32_t end; /**< End code point */
|
utf32_t end; /**< End code point */
|
||||||
enum WordBreakClass prop; /**< The word breaking property */
|
enum WordBreakClass prop; /**< The word breaking property */
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue