Evas: Added liblinebreak (static dep) to the build process.

SVN revision: 59203
This commit is contained in:
Tom Hacohen 2011-05-05 11:05:44 +00:00
parent e307051ccb
commit bf909af0f3
20 changed files with 3640 additions and 1 deletions

View File

@ -272,6 +272,33 @@ if test "x${want_fontconfig}" = "xyes" -o "x${want_fontconfig}" = "xauto" ; then
])
fi
# linebreak
# The bundled liblinebreak copy is used by default; passing
# --disable-liblinebreak (or --enable-liblinebreak=no) turns it off.
have_linebreak="no"
want_linebreak="yes"
AC_ARG_ENABLE([liblinebreak],
AC_HELP_STRING([--disable-liblinebreak],
[disable linking against liblinebreak. @<:@default=enabled@:>@]),
[
if test "x${enableval}" = "xyes" ; then
want_linebreak="yes"
else
want_linebreak="no"
fi
])
# Expose the choice to the Makefiles as the EVAS_USE_LINEBREAK conditional.
AM_CONDITIONAL(EVAS_USE_LINEBREAK, test "x${want_linebreak}" = "xyes")
# When enabled, point the compile and link flags at the in-tree copy under
# src/static_deps/liblinebreak and define HAVE_LINEBREAK for the C sources.
# The values are single-quoted so that the make variables are expanded by
# make and not by the shell running configure.
if test "x${want_linebreak}" = "xyes" ; then
have_linebreak="yes"
LINEBREAK_CFLAGS='-I$(top_srcdir)/src/static_deps/liblinebreak'
LINEBREAK_LIBS='$(top_builddir)/src/static_deps/liblinebreak/liblinebreak.la'
AC_SUBST(LINEBREAK_CFLAGS)
AC_SUBST(LINEBREAK_LIBS)
AC_DEFINE(HAVE_LINEBREAK, 1, [have liblinebreak support])
else
have_linebreak="no"
fi
# fribidi support
have_fribidi="no"
AC_ARG_ENABLE([fribidi],
@ -1645,6 +1672,8 @@ src/modules/savers/eet/Makefile
src/modules/savers/jpeg/Makefile
src/modules/savers/png/Makefile
src/modules/savers/tiff/Makefile
src/static_deps/Makefile
src/static_deps/liblinebreak/Makefile
src/lib/include/Makefile
src/examples/Makefile
README
@ -1742,6 +1771,7 @@ echo
echo "Font Rendering Helpers:"
echo " Fribidi.................: $have_fribidi"
echo " Harfbuzz................: $have_harfbuzz"
echo " liblinebreak............: $have_linebreak"
# FIXME: add non freetype2 font engine support
# FIXME: make freetype2 optional
echo

View File

@ -1,3 +1,3 @@
MAINTAINERCLEANFILES = Makefile.in
SUBDIRS = lib bin modules examples
SUBDIRS = static_deps lib bin modules examples

View File

@ -0,0 +1,6 @@
Wu Yongwei. Designed and implemented liblinebreak.
Nikolay Pultsin. Put forward the original requirements on liblinebreak,
performed tests, and made a lot of suggestions on the initial versions.
Thomas Klausner. Autoconfiscated and libtoolized liblinebreak.

View File

@ -0,0 +1,397 @@
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
* LICENCE: Update the copyright year.
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
* NEWS: Add information about the 2.0 release.
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
* Doxyfile (PROJECT_NUMBER): Set to `2.0'.
(HAVE_DOT): Set to `YES'.
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.c: Update the version number in comment to 2.0.
* linebreak.h: Ditto.
* linebreakdef.c: Ditto.
* linebreakdef.h: Ditto.
2009-12-17 Wu Yongwei <wuyongwei@gmail.com>
Change the values of enum BreakAction to the same length.
* linebreak.c (DIRECT_BRK): Rename to DIR_BRK.
(INDIRECT_BRK): Rename to IND_BRK.
(CM_INDIRECT_BRK): Rename to CMI_BRK.
(CM_PROHIBITED_BRK): Rename to CMP_BRK.
(PROHIBITED_BRK): Rename to PRH_BRK.
2009-11-29 Wu Yongwei <wuyongwei@gmail.com>
* Doxyfile (TAB_SIZE): Set to the correct size `4', as used in the
source files.
2009-11-29 Wu Yongwei <wuyongwei@gmail.com>
Update files according to UAX #14-24, for Unicode 5.2.0.
* linebreak.c: Update comments about UAX #14.
* linebreak.h: Ditto.
* linebreakdef.c: Ditto.
* linebreakdef.h: Ditto.
(LBP_CP): New enumerator for the new `CP' class as defined in
UAX #14-24.
* linebreak.c (baTable): Update for the new class `CP'.
* linebreakdata.c: Regenerate from LineBreak-5.2.0.txt.
* README: Update the reference to UAX #14-24, for Unicode 5.2.0.
2009-05-03 Wu Yongwei <wuyongwei@gmail.com>
* NEWS: Add information about the 1.2 release.
2009-04-30 Wu Yongwei <wuyongwei@gmail.com>
Optimize the Doxygen output.
* linebreak.c (lb_prop_index): Adjust its definition format
slightly.
2009-04-30 Wu Yongwei <wuyongwei@gmail.com>
* Doxyfile (USE_WINDOWS_ENCODING): Remove obsolete tag.
(DETAILS_AT_TOP): Ditto.
(MAX_DOT_GRAPH_WIDTH): Ditto.
(MAX_DOT_GRAPH_HEIGHT): Ditto.
(REFERENCED_BY_RELATION): Set to `NO'.
(REFERENCES_RELATION): Ditto.
(EXCLUDE): Add `filter_dup.c'.
2009-04-28 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.c (lb_get_next_char_utf8): Fix the issue that the index
can point to the middle of a UTF-8 sequence if End of String (EOS)
is encountered prematurely (thanks to Nikolay Pultsin and Rick Xu).
(lb_get_next_char_utf16): Fix the issue that the index can point to
the middle of a UTF-16 surrogate pair if EOS is encountered
prematurely.
2009-04-20 Wu Yongwei <wuyongwei@gmail.com>
* linebreakdef.c (lb_prop_English): Remove the specialization of
right single quotation mark as closing punctuation mark, because it
can be used as apostrophe.
(lb_prop_Spanish): Ditto.
(lb_prop_French): Ditto.
2009-04-09 Wu Yongwei <wuyongwei@gmail.com>
* Makefile.msvc: Make the `clean' target work on MSVC versions other
than 6.0; do not use precompiled header.
2009-03-07 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.h: Correct the wrong date in the documentation comment.
* linebreakdef.h: Ditto.
2009-02-10 Wu Yongwei <wuyongwei@gmail.com>
* configure.ac (AC_INIT): Increase the version to 2.0.
* Makefile.am (liblinebreak_la_LDFLAGS): Set the version-info to
`2:0'.
2009-02-10 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.h (LINEBREAK_VERSION): New macro.
(linebreak_version): New global constant declaration.
* linebreak.c (linebreak_version): New global constant definition.
2009-02-10 Wu Yongwei <wuyongwei@gmail.com>
Reduce namespace pollution.
* linebreak.c (get_lb_prop_lang): Mark as static.
(get_next_char_utf8): Rename to lb_get_next_char_utf8.
(get_next_char_utf16): Rename to lb_get_next_char_utf16.
(get_next_char_utf32): Rename to lb_get_next_char_utf32.
(is_breakable): Rename to is_line_breakable.
* linebreak.h (get_next_char_utf8): Remove the function prototype
declaration.
(get_next_char_utf16): Ditto.
(get_next_char_utf32): Ditto.
(is_breakable): Rename to is_line_breakable.
* linebreakdef.h (lb_get_next_char_utf8): Add the function prototype
declaration.
(lb_get_next_char_utf16): Ditto.
(lb_get_next_char_utf32): Ditto.
2009-02-06 Wu Yongwei <wuyongwei@gmail.com>
* NEWS: Add information about the 1.1 release.
2009-01-02 Wu Yongwei <wuyongwei@gmail.com>
* Makefile.am (EXTRA_DIST): Add the missing `LICENCE' file.
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.c: Update the version number in comment to 1.0.
* linebreak.h: Ditto.
* linebreakdef.c: Ditto.
* linebreakdef.h: Ditto.
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
* NEWS: Update for the 1.0 release.
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
* README: Correct two typos.
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
* README: Add the online URL reference.
2008-12-30 Wu Yongwei <wuyongwei@gmail.com>
* README: Update the reference to UAX #14-22, for Unicode 5.1.0.
2008-12-13 Wu Yongwei <wuyongwei@gmail.com>
Update files according to UAX #14-22, for Unicode 5.1.0.
* linebreak.c (baTable): Update according to Table 2 of UAX #14-22.
* linebreakdef.c (lb_prop_Spanish): Remove the unnecessary
customization for inverted marks in Spanish.
* linebreakdata.c: Regenerate from LineBreak-5.1.0.txt.
* linebreak.h: Update comment only.
* linebreakdef.h: Ditto.
2008-12-12 Wu Yongwei <wuyongwei@gmail.com>
* README: Update for the new build methods and better readability.
2008-12-12 Wu Yongwei <wuyongwei@gmail.com>
* Makefile.msvc: Correct the inconsistent naming in the output
message.
2008-12-12 Wu Yongwei <wuyongwei@gmail.com>
* configure.ac (AM_INIT_AUTOMAKE): Mark `foreign'.
* bootstrap: New file.
* purge: New file.
* Makefile.gcc (purge): Remove this target.
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
* NEWS: New file.
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
* AUTHORS: New file.
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
* Makefile.gcc (purge): New phony target to purge files generated by
autoconfiscation.
2008-12-10 Thomas Klausner <tk@giga.or.at>
* configure.ac: New file.
* Makefile.am: New file.
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
* Doxyfile (OUTPUT_DIRECTORY): Set to `doc'.
(ALPHABETICAL_INDEX): Set to `YES'.
2008-12-09 Wu Yongwei <wuyongwei@gmail.com>
* Makefile.msvc: New file.
2008-12-09 Wu Yongwei <wuyongwei@gmail.com>
* Makefile: Remove (to become Makefile.gcc).
* Makefile.gcc: New file (was Makefile).
2008-12-07 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.c: Adjust the comment that refers to Unicode Annex 14.
* linebreak.h: Ditto.
* linebreakdef.c: Ditto.
* linebreakdef.h: Ditto.
2008-12-07 Wu Yongwei <wuyongwei@gmail.com>
Use only POSIX basic regexp to ensure maximum portability (issues
have been found on Mac OS X, where GNU extensions do not work).
* LineBreak1.sed: Replace `[:xdigit:]' with `0-9A-F', and `\+' with
`\{1,\}'.
* LineBreak2.sed: Ditto.
2008-12-07 Wu Yongwei <wuyongwei@gmail.com>
* Makefile: Replace `*.exe' with `filter_dup$(EXEEXT)', since the
extension `.exe' is specific to Windows.
2008-04-20 Wu Yongwei <wuyongwei@gmail.com>
Add README and LICENCE files, as well as a Doxyfile to generate
documents.
* README: New file.
* LICENCE: New file.
* Doxyfile: New file.
* Makefile (doc): Add new phony target.
2008-04-04 Wu Yongwei <wuyongwei@gmail.com>
Remove the English override for plus sign: it is better treated in
the text breaking program (see ../breaktext/ for an example).
* linebreakdef.c (lb_prop_English): Remove the line for plus sign.
2008-03-29 Wu Yongwei <wuyongwei@gmail.com>
* Makefile: Correct the dependency-making rules when OLDGCC=Y.
2008-03-23 Wu Yongwei <wuyongwei@gmail.com>
* Makefile (clean): Do not remove *.exe and tags here.
(distclean): Remove *.exe and tags.
2008-03-23 Wu Yongwei <wuyongwei@gmail.com>
Remove the English override for solidus: it is better treated in the
text breaking program (see ../breaktext/ for an example).
* linebreakdef.c (lb_prop_English): Remove the line for solidus.
2008-03-16 Wu Yongwei <wuyongwei@gmail.com>
Rename init_linebreak_prop_index to init_linebreak for future
safety; make visible certain functions that are potentially useful.
* linebreak.c (init_linebreak_prop_index): Rename to init_linebreak.
(get_next_char_t): Move to linebreakdef.h.
(get_next_char_utf8): Make non-static.
(get_next_char_utf16): Ditto.
(get_next_char_utf32): Ditto.
(set_linebreaks): Ditto.
* linebreak.h (init_linebreak_prop_index): Rename to init_linebreak.
(get_next_char_utf8): Add the function prototype.
(get_next_char_utf16): Ditto.
(get_next_char_utf32): Ditto.
* linebreakdef.h (get_next_char_t): Add the typedef.
(set_linebreaks): Add the function prototype.
2008-03-16 Wu Yongwei <wuyongwei@gmail.com>
* Makefile (OLDGCC): Add support for GCC 2.95.3 (when OLDGCC=Y).
2008-03-15 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.c (set_linebreaks): Fix a bug that `==' was wrongly used
for `='.
2008-03-05 Wu Yongwei <wuyongwei@gmail.com>
Improve the performance by reducing the look-ups of the
language-specific line breaking properties array from the language
name (thanks to Nikolay Pultsin).
* linebreak.c (get_lb_prop_lang): New function.
(get_char_lb_class_lang): Change the second parameter from the
language name to the line breaking properties array.
(set_linebreaks): Look up the language-specific line breaking
properties array from the language name only once in one function
call.
2008-03-03 Wu Yongwei <wuyongwei@gmail.com>
Make minor adjustments in code and comments.
* linebreak.c: Adjust the doc comments.
(init_linebreak_prop_index): Modify a conditional to make it more
robust and consistent.
* linebreakdef.c (lb_prop_lang_map): Replace the pointer
lb_prop_default with NULL, since the value is never used.
2008-03-03 Wu Yongwei <wuyongwei@gmail.com>
Accelerate get_char_lb_class for invalid Unicode code points.
* linebreak.c (get_char_lb_class): Adjust the conditionals so that
getting the line breaking class for an invalid code point is much
faster, which requires the array of line breaking properties be
sorted.
* linebreakdef.h: Adjust a comment that the array of line break
properties must be sorted.
2008-03-02 Wu Yongwei <wuyongwei@gmail.com>
Change the values of enum BreakAction to more complete forms.
* linebreak.c (INDRCT_BRK): Rename to INDIRECT_BRK.
(CM_INDRCT_BRK): Rename to CM_INDIRECT_BRK.
(CM_PROHIBTD_BRK): Rename to CM_PROHIBITED_BRK.
(PROHIBTD_BRK): Rename to PROHIBITED_BRK.
2008-03-02 Wu Yongwei <wuyongwei@gmail.com>
Implement a two-stage search in get_char_lb_class_default to
accelerate the overall performance, especially for non-Latin
languages.
* linebreak.c (LINEBREAK_INDEX_SIZE): New constant macro.
(struct LineBreakPropertiesIndex): New struct.
(lb_prop_index): New static variable.
(init_linebreak_prop_index): New function.
(get_char_lb_class_default): New function.
(get_char_lb_class_lang): Use get_char_lb_class_default.
* linebreak.h: Detect C++ and add extern "C" guard if necessary.
(init_linebreak_prop_index): Add the prototype declaration.
* linebreakdef.h: Adjust a comment.
2008-03-02 Wu Yongwei <wuyongwei@gmail.com>
Split/refactor the code; add (doc) comments.
* Makefile (CFILES): Add linebreakdata.c and linebreakdef.c.
* linebreak.c: Add and adjust comments.
(linebreakdef.h): Add include file.
(linebreakdata.c): Remove include file.
(EOS): Remove (now in linebreakdef.h).
(enum LineBreakClass): Ditto.
(struct LineBreakProperties): Ditto.
(lbpEnglish): Remove (now in linebreakdef.c as lb_prop_English).
(lbpGerman): Remove (now in linebreakdef.c as lb_prop_German).
(lbpSpanish): Remove (now in linebreakdef.c as lb_prop_Spanish).
(lbpFrench): Remove (now in linebreakdef.c as lb_prop_French).
(lbpRussian): Remove (now in linebreakdef.c as lb_prop_Russian).
(lbpChinese): Remove (now in linebreakdef.c as lb_prop_Chinese).
(struct LineBreakPropertiesLang): Remove (now in linebreakdef.h).
(lbpLangs): Remove (now in linebreakdef.c as lb_prop_lang_map).
(get_next_char_utf16): Make sure memory access not go beyond len.
* linebreak.h: Add copyright information and adjust comments.
(stddef.h): Add include file.
* linebreakdata.c (linebreak.h): Add include file.
(linebreakdef.h): Add include file.
(lbpDefault): Make global and rename to lb_prop_default.
* linebreakdata2.tmpl: Add two include files, a comment line, and
remove `static'.
* linebreakdef.c: New file.
* linebreakdef.h: New file.
2008-02-26 Wu Yongwei <wuyongwei@gmail.com>
* linebreak.c (lbpSpanish): New array for Spanish-specific data.
(lbpLangs): Update the index array for Spanish.
(resolve_lb_class): Resolve AmbIguous class to IDeographic in
Chinese, Japanese, and Korean.
2008-02-26 Wu Yongwei <wuyongwei@gmail.com>
* Makefile (LineBreak.txt): Add new rule to retrieve it from the Web
if it is not already there.
2008-02-23 Wu Yongwei <wuyongwei@gmail.com>
Add files for linebreak.
* LineBreak1.sed: New file.
* LineBreak2.sed: New file.
* Makefile: New file.
* filter_dup.c: New file.
* linebreak.c: New file.
* linebreak.h: New file.
* linebreakdata.c: New file.
* linebreakdata1.tmpl: New file.
* linebreakdata2.tmpl: New file.
* linebreakdata3.tmpl: New file.

View File

@ -0,0 +1,18 @@
Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgement in the product documentation would
be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not
be misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.

View File

@ -0,0 +1 @@
# Intended to be run with "sed -n": only lines on which the substitution
# succeeds are printed.  A matching line starts with a run of hexadecimal
# digits and dots (a code point or a "first..last" range), then ";", then a
# two-character class code (upper-case letter followed by an upper-case
# letter or digit), then " #" and a trailing comment.  The trailing comment
# is dropped and the "range;class" part is kept.
s/\(^[0-9A-F.]\{1,\};[A-Z][A-Z0-9]\) #.*/\1/p

View File

@ -0,0 +1,2 @@
# Step 1: normalise a single code point "X;" into the range form "X..X;"
# so that every line has the same "first..last;class" shape.
s/^\([0-9A-F]\{1,\}\);/\1..\1;/
# Step 2: turn "first..last;CC" into a C initializer entry of the form
# "{ 0xfirst, 0xlast, LBP_CC },".
s/^\([0-9A-F]\{1,\}\)\.\.\([0-9A-F]\{1,\}\);\([A-Z][A-Z0-9]\)/ { 0x\1, 0x\2, LBP_\3 },/

View File

@ -0,0 +1,16 @@
MAINTAINERCLEANFILES = Makefile.in

#noinst_PROGRAMS = filter_dup

# liblinebreak is built here as a private, non-installed convenience
# library (noinst_LTLIBRARIES), so its headers must not be installed
# either: installing them would clash with a system-wide liblinebreak.
# noinst_HEADERS still ships them in the distribution tarball.
noinst_HEADERS = linebreak.h linebreakdef.h

noinst_LTLIBRARIES = liblinebreak.la

liblinebreak_la_SOURCES = \
linebreak.c \
linebreakdata.c \
linebreakdef.c

# Inputs needed to regenerate linebreakdata.c, plus the upstream licence.
EXTRA_DIST = \
LineBreak1.sed \
LineBreak2.sed \
linebreakdata1.tmpl \
linebreakdata2.tmpl \
linebreakdata3.tmpl \
LICENCE

View File

@ -0,0 +1,37 @@
New in 2.0
- Update the algorithm and data according to UAX #14-24 and
LineBreak-5.2.0.txt
- Rename some functions to reduce namespace pollution
- Make Doxygen documentation better
New in 1.2
- Fix the bug that an assertion in code can fail if an invalid UTF-8 or
UTF-16 sequence is encountered near the end of input
- Remove the specialization of right single quotation mark as closing
punctuation mark in English, French, and Spanish, because it can be
used as apostrophe
- Make Doxygen documentation better
New in 1.1
- Make get_lb_prop_lang static and not an exported symbol
- Define is_line_breakable to alias to is_breakable
- Declare get_next_char_utf* will be changed to lb_get_next_char_utf*
- Move the declarations of get_next_char_utf* from linebreak.h to
linebreakdef.h
- Add the function documentation comments to the header files
New in 1.0
- Update the line breaking data according to UAX #14-22 and
LineBreak-5.1.0.txt
- Add autoconfiscation support (./configure, make, make install)
- Add Makefile for MSVC
First public release (0.9.6, or 20080421)
- Implement line breaking algorithm according to UAX #14-19
- Line breaking data is generated from LineBreak-5.0.0.txt
- Makefile only supports GCC

View File

@ -0,0 +1,86 @@
NOTICE: This is a copy of the original liblinebreak that has been slightly
adapted (mostly build-related changes) in order to work nicely with Evas.
L I B L I N E B R E A K
=======================
Overview
--------
This is the README file for liblinebreak, an implementation of the line
breaking algorithm as described in Unicode 5.2.0 Standard Annex 14,
Revision 24, available at
<URL:http://www.unicode.org/reports/tr14/tr14-24.html>
Check this URL for up-to-date information:
<URL:http://vimgadgets.sourceforge.net/liblinebreak/>
Licence
-------
This library is released under an open-source licence, the zlib/libpng
licence. Please check the file LICENCE for details.
Apart from using the algorithm, part of the code is derived from the
data provided under
<URL:http://www.unicode.org/Public/>
And the Unicode Terms of Use may apply:
<URL:http://www.unicode.org/copyright.html>
Installation
------------
There are three ways to build the library:
1) On *NIX systems supported by the autoconfiscation tools, do the
normal
./configure
make
sudo make install
to build and install both the dynamic and static libraries. In
addition, one may
- type `make doc' to generate the doxygen documentation; or
- type `make linebreakdata' to regenerate linebreakdata.c from
LineBreak.txt.
2) On systems where GCC and Binutils are supported, one can type
cp -p Makefile.gcc Makefile
make
to build the static library. In addition, one may
- type `make debug' or `make release' to explicitly generate the
debug or release build;
- type `make doc' to generate the doxygen documentation; or
- type `make linebreakdata' to regenerate linebreakdata.c from
LineBreak.txt.
3) On Windows, apart from using method 1 (Cygwin/MSYS) and method 2
(MinGW), MSVC can also be used. Type
nmake -f Makefile.msvc
to build the static library. By default the debug version is built.
To build the release version, type
nmake -f Makefile.msvc CFG="linebreak - Win32 Release"
Documentation
-------------
Check the generated document doc/html/linebreak_8h.html for the public
interfaces exposed to applications.
$Id: README,v 1.6 2009/11/29 08:09:13 adah Exp $
vim:autoindent:expandtab:formatoptions=tcqlmn:textwidth=72:

View File

@ -0,0 +1,48 @@
#include <stdio.h>
#include <string.h>
/*
 * Filter for the generated line breaking table.
 *
 * Reads lines from stdin.  Lines that look like table entries
 * ("\t{ <begin> <end> <LBP_xx> },") are buffered, and consecutive entries
 * that carry the same property are merged into a single entry spanning
 * from the first entry's <begin> to the last entry's <end>.  Every other
 * line (including the terminating LBP_Undef entry and any entry that
 * cannot be parsed) flushes the pending merged entry and is copied to
 * stdout unchanged.
 *
 * Returns 0 always.
 */
int main(void)
{
	char s[80];
	char beg[16];
	char end[16];
	char prop[16];
	char lastbeg[16];
	char lastend[16];
	char lastprop[16];

	/* An empty lastprop means "no merged entry is pending". */
	lastbeg[0] = 0;
	lastend[0] = 0;
	lastprop[0] = 0;

	while (fgets(s, sizeof s, stdin) != NULL)
	{
		/*
		 * The field widths keep each token within its 16-byte buffer,
		 * and a line that does not yield all three fields is treated
		 * as a pass-through line instead of reusing stale data.
		 */
		if (strstr(s, "LBP_") == NULL || strstr(s, "LBP_Undef") != NULL ||
			sscanf(s, "\t{ %15s %15s %15s }", beg, end, prop) != 3)
		{
			if (lastprop[0])
			{
				printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop);
				lastprop[0] = 0;
			}
			printf("%s", s);
			continue;
		}

		/* The property changed: emit the entry merged so far. */
		if (lastprop[0] && strcmp(lastprop, prop) != 0)
		{
			printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop);
			lastprop[0] = 0;
		}

		/* Start a new merged entry if none is pending. */
		if (lastprop[0] == 0)
		{
			strcpy(lastbeg, beg);
			strcpy(lastprop, prop);
		}

		/* Extend the pending entry up to the end of this one. */
		strcpy(lastend, end);
	}

	/* Flush whatever is still pending at end of input. */
	if (lastprop[0])
	{
		printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop);
	}
	return 0;
}

View File

@ -0,0 +1,11 @@
#!/bin/sh
# Regenerates linebreakdata.c from the Unicode LineBreak.txt data file.
#
# LineBreak.txt is downloaded if it is not already present.  The data
# lines are extracted and converted with the two sed scripts, runs of
# entries with the same class are merged by ./filter_dup (which must
# already be built), and the result is wrapped in the three templates.

# Stop at the first failing command so that a broken download or a
# missing input never produces a bogus linebreakdata.c.
set -e

# Always remove the intermediate files, also when a step fails.
trap 'rm -f tmp.txt tmp.c linebreakdata.c.new' EXIT

if [ ! -f "LineBreak.txt" ]; then
	# Do not leave a partial download behind: it would be reused next time.
	wget http://unicode.org/Public/UNIDATA/LineBreak.txt || {
		rm -f LineBreak.txt
		exit 1
	}
fi

sed -n -f LineBreak1.sed LineBreak.txt > tmp.txt
sed -f LineBreak2.sed tmp.txt | ./filter_dup > tmp.c

# The first two lines of LineBreak.txt are copied into the generated file.
head -n 2 LineBreak.txt > tmp.txt

# Build the output next to the target and only replace it on success.
cat linebreakdata1.tmpl tmp.txt linebreakdata2.tmpl tmp.c linebreakdata3.tmpl > linebreakdata.c.new
mv linebreakdata.c.new linebreakdata.c

View File

@ -0,0 +1,734 @@
/* vim: set tabstop=4 shiftwidth=4: */
/*
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute
* it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgement in the product
* documentation would be appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source
* distribution.
*
* The main reference is Unicode Standard Annex 14 (UAX #14):
* <URL:http://www.unicode.org/reports/tr14/>
*
* When this library was designed, this annex was at Revision 19, for
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
* This library has been updated according to Revision 24, for
* Unicode 5.2.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*/
/**
* @file linebreak.c
*
* Implementation of the line breaking algorithm as described in Unicode
* Standard Annex 14.
*
* @version 2.0, 2010/01/03
* @author Wu Yongwei
*/
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "linebreak.h"
#include "linebreakdef.h"
/**
 * Size of the second-level index to the line breaking properties, i.e.
 * the number of entries in the #lb_prop_index array.
 */
#define LINEBREAK_INDEX_SIZE 40
/**
 * Version number of the library, exported as a global constant and
 * initialized from the LINEBREAK_VERSION macro.
 */
const int linebreak_version = LINEBREAK_VERSION;
/**
 * Enumeration of break actions.  These are the cell values stored in the
 * break action pair table (#baTable) below.  The enumerator names are
 * deliberately all the same length so that the table lines up in columns.
 */
enum BreakAction
{
DIR_BRK, /**< Direct break opportunity */
IND_BRK, /**< Indirect break opportunity */
CMI_BRK, /**< Indirect break opportunity for combining marks */
CMP_BRK, /**< Prohibited break for combining marks */
PRH_BRK /**< Prohibited break */
};
/**
* Break action pair table. This is a direct mapping of Table 2 of
* Unicode Standard Annex 14, Revision 24.
*/
static enum BreakAction baTable[LBP_JT][LBP_JT] = {
{ /* OP */
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
{ /* CL */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* CP */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* QU */
PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* GL */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* NS */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* EX */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* SY */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* IS */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* PR */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* PO */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* NU */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* AL */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* ID */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* IN */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* HY */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* BA */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* BB */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* B2 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* ZW */
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* CM */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
{ /* WJ */
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
{ /* H2 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
{ /* H3 */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
{ /* JL */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
{ /* JV */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
{ /* JT */
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
};
/**
 * Struct for the second-level index to the line breaking properties.
 * Each entry covers all code points up to and including \c end and
 * records where in the properties array the search for such a code
 * point should start.
 */
struct LineBreakPropertiesIndex
{
utf32_t end; /**< Last code point covered by this entry */
struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
};
/**
 * Second-level index to the line breaking properties.
 *
 * Only the first entry is initialized statically: it covers every code
 * point (end is 0xFFFFFFFF) and points to the start of the default
 * properties array, so a lookup works, by scanning from the beginning,
 * even before #init_linebreak has filled in the remaining entries.
 */
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
{
{ 0xFFFFFFFF, lb_prop_default }
};
/**
* Initializes the second-level index to the line breaking properties.
* If it is not called, the performance of #get_char_lb_class_lang (and
* thus the main functionality) can be pretty bad, especially for big
* code points like those of Chinese.
*/
void init_linebreak(void)
{
size_t i;
size_t iPropDefault;
size_t len;
size_t step;
len = 0;
while (lb_prop_default[len].prop != LBP_Undefined)
++len;
step = len / LINEBREAK_INDEX_SIZE;
iPropDefault = 0;
for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
{
lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
iPropDefault += step;
lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
}
lb_prop_index[--i].end = 0xFFFFFFFF;
}
/**
 * Looks up the language-specific line breaking properties.
 *
 * The language map is walked in order, and the first entry whose name
 * matches the beginning of \a lang (compared over the entry's own name
 * length) wins.
 *
 * @param lang  language of the text; may be \c NULL
 * @return      pointer to the language-specific line breaking
 *              properties array if found; \c NULL otherwise
 */
static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
{
	struct LineBreakPropertiesLang *entry;

	if (lang == NULL)
		return NULL;

	entry = lb_prop_lang_map;
	while (entry->lang != NULL)
	{
		if (strncmp(lang, entry->lang, entry->namelen) == 0)
			return entry->lbp;
		++entry;
	}
	return NULL;
}
/**
 * Finds the line breaking class of a character in a sorted line
 * breaking properties array.
 *
 * The scan stops at the terminating LBP_Undefined entry, or as soon as
 * an entry starts beyond \a ch (the array is sorted, so no later entry
 * can contain it).
 *
 * @param ch   character to check
 * @param lbp  pointer to the line breaking properties array
 * @return     the line breaking class if found; \c LBP_XX otherwise
 */
static enum LineBreakClass get_char_lb_class(
        utf32_t ch,
        struct LineBreakProperties *lbp)
{
    for (; lbp->prop != LBP_Undefined; ++lbp)
    {
        if (ch < lbp->start)
            break;
        if (ch <= lbp->end)
            return lbp->prop;
    }
    return LBP_XX;
}
/**
 * Finds the line breaking class of a character in the default line
 * breaking properties, using the second-level index to pick the slice
 * where the search starts.
 *
 * @param ch  character to check
 * @return    the line breaking class if found; \c LBP_XX otherwise
 */
static enum LineBreakClass get_char_lb_class_default(
        utf32_t ch)
{
    size_t slot;

    /* Advance to the first slice whose last code point is not below ch */
    for (slot = 0; ch > lb_prop_index[slot].end; ++slot)
        ;
    assert(slot < LINEBREAK_INDEX_SIZE);
    return get_char_lb_class(ch, lb_prop_index[slot].lbp);
}
/**
 * Finds the line breaking class of a character for a specific
 * language.  The language-specific table, when given, is consulted
 * first; the default data are used when there is no such table or it
 * has no entry for the character.
 *
 * @param ch       character to check
 * @param lbpLang  pointer to the language-specific line breaking
 *                 properties array (may be \c NULL)
 * @return         the line breaking class if found; \c LBP_XX
 *                 otherwise
 */
static enum LineBreakClass get_char_lb_class_lang(
        utf32_t ch,
        struct LineBreakProperties *lbpLang)
{
    if (lbpLang != NULL)
    {
        /* A language-specific override takes precedence */
        enum LineBreakClass specific = get_char_lb_class(ch, lbpLang);

        if (specific != LBP_XX)
            return specific;
    }

    /* No language context, or no override for this character */
    return get_char_lb_class_default(ch);
}
/**
 * Maps ambiguous or complicated line breaking classes onto classes
 * that the pair table can handle.  The treatment is deliberately
 * simplistic:
 *  - AI becomes ID when the language starts with "zh", "ja" or "ko",
 *    and AL otherwise;
 *  - SA, SG and XX become AL;
 *  - every other class is returned unchanged.
 *
 * @param lbc   line breaking class to resolve
 * @param lang  language of the text (may be \c NULL)
 * @return      the resolved line breaking class
 */
static enum LineBreakClass resolve_lb_class(
        enum LineBreakClass lbc,
        const char *lang)
{
    if (lbc == LBP_AI && lang != NULL)
    {
        int isCJK = strncmp(lang, "zh", 2) == 0 ||  /* Chinese */
                    strncmp(lang, "ja", 2) == 0 ||  /* Japanese */
                    strncmp(lang, "ko", 2) == 0;    /* Korean */

        if (isCJK)
            return LBP_ID;
    }

    if (lbc == LBP_AI || lbc == LBP_SA || lbc == LBP_SG || lbc == LBP_XX)
        return LBP_AL;

    return lbc;
}
/**
 * Reads the next Unicode character from a UTF-8 sequence and advances
 * the index past it.  If the string ends in the middle of a multi-byte
 * sequence, #EOS is returned and the index is left untouched.
 *
 * Bytes that cannot start a multi-byte sequence (below 0xC2 or above
 * 0xF4) are returned verbatim and consume one byte.  Continuation
 * bytes are only masked, not validated.
 *
 * @param[in]     s    input UTF-8 string
 * @param[in]     len  length of the string in bytes
 * @param[in,out] ip   pointer to the index
 * @return             the Unicode character beginning at the index; or
 *                     #EOS if end of input is encountered
 */
utf32_t lb_get_next_char_utf8(
        const utf8_t *s,
        size_t len,
        size_t *ip)
{
    size_t pos;
    utf8_t lead;
    utf32_t code;

    assert(*ip <= len);
    pos = *ip;
    if (pos == len)
        return EOS;

    lead = s[pos];

    /* One-byte sequence, stray tail byte, or invalid lead byte */
    if (lead < 0xC2 || lead > 0xF4)
    {
        *ip = pos + 1;
        return lead;
    }

    /* Two-byte sequence */
    if (lead < 0xE0)
    {
        if (pos + 2 > len)
            return EOS;
        code = ((lead & 0x1F) << 6) + (s[pos + 1] & 0x3F);
        *ip = pos + 2;
        return code;
    }

    /* Three-byte sequence */
    if (lead < 0xF0)
    {
        if (pos + 3 > len)
            return EOS;
        code = ((lead & 0x0F) << 12) +
               ((s[pos + 1] & 0x3F) << 6) +
               ((s[pos + 2] & 0x3F));
        *ip = pos + 3;
        return code;
    }

    /* Four-byte sequence */
    if (pos + 4 > len)
        return EOS;
    code = ((lead & 0x07) << 18) +
           ((s[pos + 1] & 0x3F) << 12) +
           ((s[pos + 2] & 0x3F) << 6) +
           ((s[pos + 3] & 0x3F));
    *ip = pos + 4;
    return code;
}
/**
* Gets the next Unicode character in a UTF-16 sequence. The index will
* be advanced to the next complete character, unless the end of string
* is reached in the middle of a UTF-16 surrogate pair.
*
* @param[in] s input UTF-16 string
* @param[in] len length of the string in words
* @param[in,out] ip pointer to the index
* @return the Unicode character beginning at the index; or
* #EOS if end of input is encountered
*/
utf32_t lb_get_next_char_utf16(
const utf16_t *s,
size_t len,
size_t *ip)
{
utf16_t ch;
assert(*ip <= len);
if (*ip == len)
return EOS;
ch = s[(*ip)++];
if (ch < 0xD800 || ch > 0xDBFF)
{ /* If the character is not a high surrogate */
return ch;
}
if (*ip == len)
{ /* If the input ends here (an error) */
--(*ip);
return EOS;
}
if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
{ /* If the next character is not the low surrogate (an error) */
return ch;
}
/* Return the constructed character and advance the index again */
return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
}
/**
 * Reads the next Unicode character from a UTF-32 sequence and advances
 * the index by one.
 *
 * @param[in]     s    input UTF-32 string
 * @param[in]     len  length of the string in dwords
 * @param[in,out] ip   pointer to the index
 * @return             the Unicode character beginning at the index; or
 *                     #EOS if end of input is encountered
 */
utf32_t lb_get_next_char_utf32(
        const utf32_t *s,
        size_t len,
        size_t *ip)
{
    utf32_t code;

    assert(*ip <= len);
    if (*ip == len)
        return EOS;

    code = s[*ip];
    *ip += 1;
    return code;
}
/**
 * Sets the line breaking information for a generic input string.
 *
 * For every code unit of the input one entry of \a brks is written,
 * describing the break opportunity \e after that unit.  Code units
 * that are not the last unit of a character get
 * #LINEBREAK_INSIDEACHAR; the last character of the input always gets
 * #LINEBREAK_MUSTBREAK.  Nothing is written if the very first read
 * already yields #EOS.
 *
 * @param[in]  s              input string
 * @param[in]  len            length of the input, in code units
 * @param[in]  lang           language of the input (may be \c NULL)
 * @param[out] brks           pointer to the output breaking data
 *                            (at least \a len entries), containing
 *                            #LINEBREAK_MUSTBREAK,
 *                            #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
 *                            or #LINEBREAK_INSIDEACHAR
 * @param[in]  get_next_char  function to get the next UTF-32 character
 */
void set_linebreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    utf32_t ch;
    enum LineBreakClass lbcCur;     /* class on the left of the pair */
    enum LineBreakClass lbcNew;     /* class of the character just read */
    enum LineBreakClass lbcLast;    /* raw class of the previous character */
    struct LineBreakProperties *lbpLang;
    size_t posCur = 0;
    size_t posLast = 0;

    --posLast;  /* To be ++'d later */
    ch = get_next_char(s, len, &posCur);
    if (ch == EOS)
        return;
    lbpLang = get_lb_prop_lang(lang);
    lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
    lbcNew = LBP_Undefined;

nextline:

    /* Special treatment for the first character of a line: lbcCur must
     * end up as a class that is either handled explicitly below (BK,
     * CR) or is a valid row of the pair table (<= LBP_JT) */
    switch (lbcCur)
    {
    case LBP_LF:
    case LBP_NL:
        lbcCur = LBP_BK;
        break;
    case LBP_CB:
        /* CB is not a pair-table class; treat it like the mid-line
         * case does, otherwise the table would be indexed out of
         * bounds for a line that starts with a CB character */
        lbcCur = LBP_BA;
        break;
    case LBP_SP:
        lbcCur = LBP_WJ;
        break;
    default:
        break;
    }

    /* Process a line till an explicit break or end of string */
    for (;;)
    {
        /* All but the last code unit of the previous character are
         * marked as being inside a character */
        for (++posLast; posLast < posCur - 1; ++posLast)
        {
            brks[posLast] = LINEBREAK_INSIDEACHAR;
        }
        assert(posLast == posCur - 1);
        lbcLast = lbcNew;
        ch = get_next_char(s, len, &posCur);
        if (ch == EOS)
            break;
        lbcNew = get_char_lb_class_lang(ch, lbpLang);
        if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
        {
            /* Mandatory break; the new character starts the next line */
            brks[posLast] = LINEBREAK_MUSTBREAK;
            lbcCur = resolve_lb_class(lbcNew, lang);
            goto nextline;
        }

        /* Classes that are handled outside the pair table */
        switch (lbcNew)
        {
        case LBP_SP:
            /* Spaces do not change lbcCur; they are seen via lbcLast */
            brks[posLast] = LINEBREAK_NOBREAK;
            continue;
        case LBP_BK:
        case LBP_LF:
        case LBP_NL:
            brks[posLast] = LINEBREAK_NOBREAK;
            lbcCur = LBP_BK;
            continue;
        case LBP_CR:
            brks[posLast] = LINEBREAK_NOBREAK;
            lbcCur = LBP_CR;
            continue;
        case LBP_CB:
            brks[posLast] = LINEBREAK_ALLOWBREAK;
            lbcCur = LBP_BA;
            continue;
        default:
            break;
        }

        lbcNew = resolve_lb_class(lbcNew, lang);

        assert(lbcCur <= LBP_JT);
        assert(lbcNew <= LBP_JT);
        switch (baTable[lbcCur - 1][lbcNew - 1])
        {
        case DIR_BRK:
            brks[posLast] = LINEBREAK_ALLOWBREAK;
            break;
        case CMI_BRK:
        case IND_BRK:
            /* Break allowed only if spaces intervene */
            if (lbcLast == LBP_SP)
            {
                brks[posLast] = LINEBREAK_ALLOWBREAK;
            }
            else
            {
                brks[posLast] = LINEBREAK_NOBREAK;
            }
            break;
        case CMP_BRK:
            brks[posLast] = LINEBREAK_NOBREAK;
            /* Without a preceding space lbcCur is kept unchanged */
            if (lbcLast != LBP_SP)
                continue;
            break;
        case PRH_BRK:
            brks[posLast] = LINEBREAK_NOBREAK;
            break;
        }

        lbcCur = lbcNew;
    }

    assert(posLast == posCur - 1 && posCur <= len);
    /* Break after the last character */
    brks[posLast] = LINEBREAK_MUSTBREAK;
    /* When the input contains incomplete sequences */
    while (posCur < len)
    {
        brks[posCur++] = LINEBREAK_INSIDEACHAR;
    }
}
/**
* Sets the line breaking information for a UTF-8 input string.
*
* @param[in] s input UTF-8 string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data, containing
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf8(
const utf8_t *s,
size_t len,
const char *lang,
char *brks)
{
set_linebreaks(s, len, lang, brks,
(get_next_char_t)lb_get_next_char_utf8);
}
/**
* Sets the line breaking information for a UTF-16 input string.
*
* @param[in] s input UTF-16 string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data, containing
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf16(
const utf16_t *s,
size_t len,
const char *lang,
char *brks)
{
set_linebreaks(s, len, lang, brks,
(get_next_char_t)lb_get_next_char_utf16);
}
/**
* Sets the line breaking information for a UTF-32 input string.
*
* @param[in] s input UTF-32 string
* @param[in] len length of the input
* @param[in] lang language of the input
* @param[out] brks pointer to the output breaking data, containing
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
*/
void set_linebreaks_utf32(
const utf32_t *s,
size_t len,
const char *lang,
char *brks)
{
set_linebreaks(s, len, lang, brks,
(get_next_char_t)lb_get_next_char_utf32);
}
/**
 * Tells whether a line break can occur between two Unicode characters.
 * This is a convenience wrapper: the two characters are analysed as a
 * two-element UTF-32 string and the result for the first position is
 * returned.  Prefer #set_linebreaks_utf32 for real text, since
 * complicated cases involving combining marks, spaces, etc. cannot be
 * processed correctly from a single pair.
 *
 * @param char1  the first Unicode character
 * @param char2  the second Unicode character
 * @param lang   language of the input
 * @return       one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 *               #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
 */
int is_line_breakable(
        utf32_t char1,
        utf32_t char2,
        const char* lang)
{
    utf32_t pair[2];
    char result[2];

    pair[0] = char1;
    pair[1] = char2;
    set_linebreaks_utf32(pair, 2, lang, result);

    /* result[0] is the break opportunity between char1 and char2 */
    return result[0];
}

View File

@ -0,0 +1,87 @@
/* vim: set tabstop=4 shiftwidth=4: */
/*
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute
* it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgement in the product
* documentation would be appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source
* distribution.
*
* The main reference is Unicode Standard Annex 14 (UAX #14):
* <URL:http://www.unicode.org/reports/tr14/>
*
* When this library was designed, this annex was at Revision 19, for
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
* This library has been updated according to Revision 24, for
* Unicode 5.2.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*/
/**
* @file linebreak.h
*
* Header file for the line breaking algorithm.
*
* @version 2.0, 2010/01/03
* @author Wu Yongwei
*/
#ifndef LINEBREAK_H
#define LINEBREAK_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#define LINEBREAK_VERSION 0x0200 /**< Version of the library linebreak */
/** Library version as a linkable symbol; its definition is not part of this header. */
extern const int linebreak_version;
/* The UTF types may be supplied by the including project: define
 * LINEBREAK_UTF_TYPES_DEFINED beforehand to suppress these typedefs. */
#ifndef LINEBREAK_UTF_TYPES_DEFINED
#define LINEBREAK_UTF_TYPES_DEFINED
typedef unsigned char utf8_t; /**< Type for UTF-8 data points */
typedef unsigned short utf16_t; /**< Type for UTF-16 data points */
typedef unsigned int utf32_t; /**< Type for UTF-32 data points */
#endif
/* Values written to the brks output array, one per input code unit */
#define LINEBREAK_MUSTBREAK 0 /**< Break is mandatory */
#define LINEBREAK_ALLOWBREAK 1 /**< Break is allowed */
#define LINEBREAK_NOBREAK 2 /**< No break is possible */
#define LINEBREAK_INSIDEACHAR 3 /**< A UTF-8/16 sequence is unfinished */
/** Builds the lookup index over the default properties; optional, but
 * lookups are slow for large code points if it is not called. */
void init_linebreak(void);
/** Computes break information for a UTF-8 string; \a len is in bytes
 * and \a brks must have room for \a len entries. */
void set_linebreaks_utf8(
const utf8_t *s, size_t len, const char* lang, char *brks);
/** Computes break information for a UTF-16 string; \a len is in 16-bit
 * words and \a brks must have room for \a len entries. */
void set_linebreaks_utf16(
const utf16_t *s, size_t len, const char* lang, char *brks);
/** Computes break information for a UTF-32 string; \a len is in 32-bit
 * units and \a brks must have room for \a len entries. */
void set_linebreaks_utf32(
const utf32_t *s, size_t len, const char* lang, char *brks);
/** Returns the LINEBREAK_* value for the position between \a char1 and
 * \a char2, analysed as an isolated pair. */
int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang);
#ifdef __cplusplus
}
#endif
#endif /* LINEBREAK_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
/* The content of this file is generated from:

View File

@ -0,0 +1,7 @@
*/
#include "linebreak.h"
#include "linebreakdef.h"
/** Default line breaking properties as from the Unicode Web site. */
struct LineBreakProperties lb_prop_default[] = {

View File

@ -0,0 +1,2 @@
{ 0xFFFFFFFF, 0xFFFFFFFF, LBP_Undefined }
};

View File

@ -0,0 +1,139 @@
/* vim: set tabstop=4 shiftwidth=4: */
/*
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute
* it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgement in the product
* documentation would be appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source
* distribution.
*
* The main reference is Unicode Standard Annex 14 (UAX #14):
* <URL:http://www.unicode.org/reports/tr14/>
*
* When this library was designed, this annex was at Revision 19, for
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
* This library has been updated according to Revision 24, for
* Unicode 5.2.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*/
/**
* @file linebreakdef.c
*
* Definition of language-specific data.
*
* @version 2.0, 2010/01/03
* @author Wu Yongwei
*/
#include "linebreak.h"
#include "linebreakdef.h"
/**
 * English-specific data over the default Unicode rules.
 * Entries are sorted by code point; the LBP_Undefined entry terminates
 * the array.
 */
static struct LineBreakProperties lb_prop_English[] = {
    { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
    { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
    { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
    { 0, 0, LBP_Undefined }
};
/**
 * German-specific data over the default Unicode rules.
 * Entries are sorted by code point; the LBP_Undefined entry terminates
 * the array.
 */
static struct LineBreakProperties lb_prop_German[] = {
    { 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */
    { 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */
    { 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */
    { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
    { 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */
    { 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */
    { 0, 0, LBP_Undefined }
};
/**
 * Spanish-specific data over the default Unicode rules.
 * Entries are sorted by code point; the LBP_Undefined entry terminates
 * the array.
 */
static struct LineBreakProperties lb_prop_Spanish[] = {
    { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
    { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
    { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
    { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
    { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
    { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
    { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
    { 0, 0, LBP_Undefined }
};
/**
 * French-specific data over the default Unicode rules.
 * Entries are sorted by code point; the LBP_Undefined entry terminates
 * the array.
 */
static struct LineBreakProperties lb_prop_French[] = {
    { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
    { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
    { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
    { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
    { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
    { 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
    { 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
    { 0, 0, LBP_Undefined }
};
/**
 * Russian-specific data over the default Unicode rules.
 * Entries are sorted by code point; the LBP_Undefined entry terminates
 * the array.
 */
static struct LineBreakProperties lb_prop_Russian[] = {
    { 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
    { 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
    { 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
    { 0, 0, LBP_Undefined }
};
/**
 * Chinese-specific data over the default Unicode rules.
 * Entries are sorted by code point; the LBP_Undefined entry terminates
 * the array.
 */
static struct LineBreakProperties lb_prop_Chinese[] = {
    { 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
    { 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */
    { 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
    { 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
    { 0, 0, LBP_Undefined }
};
/**
 * Association data of language-specific line breaking properties with
 * language names.  This is the definition for the static data in this
 * file.  If you want more flexibility, or do not need the data here,
 * you may want to redefine \e lb_prop_lang_map in your C source file.
 *
 * Each entry gives the language name, the number of leading characters
 * of a language string that must match it, and the associated table.
 * The entry with a \c NULL name terminates the array.
 */
struct LineBreakPropertiesLang lb_prop_lang_map[] = {
    { "en", 2, lb_prop_English },
    { "de", 2, lb_prop_German },
    { "es", 2, lb_prop_Spanish },
    { "fr", 2, lb_prop_French },
    { "ru", 2, lb_prop_Russian },
    { "zh", 2, lb_prop_Chinese },
    { NULL, 0, NULL }   /* terminator */
};

View File

@ -0,0 +1,149 @@
/* vim: set tabstop=4 shiftwidth=4: */
/*
* Line breaking in a Unicode sequence. Designed to be used in a
* generic text renderer.
*
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the author be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute
* it freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must
* not claim that you wrote the original software. If you use this
* software in a product, an acknowledgement in the product
* documentation would be appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must
* not be misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source
* distribution.
*
* The main reference is Unicode Standard Annex 14 (UAX #14):
* <URL:http://www.unicode.org/reports/tr14/>
*
* When this library was designed, this annex was at Revision 19, for
* Unicode 5.0.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
*
* This library has been updated according to Revision 24, for
* Unicode 5.2.0:
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
*
* The Unicode Terms of Use are available at
* <URL:http://www.unicode.org/copyright.html>
*/
/**
* @file linebreakdef.h
*
* Definitions of internal data structures, declarations of global
* variables, and function prototypes for the line breaking algorithm.
*
* @version 2.0, 2010/01/03
* @author Wu Yongwei
*/
/**
 * Constant value to mark the end of string.  It must be a value that
 * none of the character readers can produce from well-formed input, so
 * it lies outside the Unicode code point range (0..0x10FFFF).  A value
 * inside the range, such as 0xFFFF, can be returned as an ordinary
 * character and would then be mistaken for the end of the input.
 */
#define EOS 0xFFFFFFFF
/**
 * Line break classes, following Table 1 of Unicode Standard Annex 14.
 * The list was originally taken from Revision 19; LBP_CP belongs to the
 * later update of the library to Revision 24.
 *
 * The order matters: the classes from LBP_OP up to LBP_JT are used
 * (minus one) as row and column numbers of the pair table, and the
 * classes after LBP_JT must be resolved or handled specially before the
 * table is consulted.
 */
enum LineBreakClass
{
    /* Signals an error condition; also terminates property arrays */
    LBP_Undefined = 0,  /**< Undefined */

    /* Classes that are treated in the pair table */
    LBP_OP,             /**< Opening punctuation */
    LBP_CL,             /**< Closing punctuation */
    LBP_CP,             /**< Closing parenthesis */
    LBP_QU,             /**< Ambiguous quotation */
    LBP_GL,             /**< Glue */
    LBP_NS,             /**< Non-starters */
    LBP_EX,             /**< Exclamation/Interrogation */
    LBP_SY,             /**< Symbols allowing break after */
    LBP_IS,             /**< Infix separator */
    LBP_PR,             /**< Prefix */
    LBP_PO,             /**< Postfix */
    LBP_NU,             /**< Numeric */
    LBP_AL,             /**< Alphabetic */
    LBP_ID,             /**< Ideographic */
    LBP_IN,             /**< Inseparable characters */
    LBP_HY,             /**< Hyphen */
    LBP_BA,             /**< Break after */
    LBP_BB,             /**< Break before */
    LBP_B2,             /**< Break on either side (but not pair) */
    LBP_ZW,             /**< Zero-width space */
    LBP_CM,             /**< Combining marks */
    LBP_WJ,             /**< Word joiner */
    LBP_H2,             /**< Hangul LV */
    LBP_H3,             /**< Hangul LVT */
    LBP_JL,             /**< Hangul L Jamo */
    LBP_JV,             /**< Hangul V Jamo */
    LBP_JT,             /**< Hangul T Jamo */

    /* Classes that are not treated in the pair table */
    LBP_AI,             /**< Ambiguous (alphabetic or ideograph) */
    LBP_BK,             /**< Break (mandatory) */
    LBP_CB,             /**< Contingent break */
    LBP_CR,             /**< Carriage return */
    LBP_LF,             /**< Line feed */
    LBP_NL,             /**< Next line */
    LBP_SA,             /**< South-East Asian */
    LBP_SG,             /**< Surrogates */
    LBP_SP,             /**< Space */
    LBP_XX              /**< Unknown */
};
/**
 * Struct for entries of line break properties.  The array of the
 * entries \e must be sorted, because lookups stop at the first entry
 * that starts beyond the character being searched.  An entry whose
 * \c prop is LBP_Undefined terminates the array.
 */
struct LineBreakProperties
{
    utf32_t start; /**< Starting coding point (inclusive) */
    utf32_t end; /**< End coding point (inclusive) */
    enum LineBreakClass prop; /**< The line breaking property */
};
/**
 * Struct for association of language-specific line breaking properties
 * with language names.  An entry whose \c lang is \c NULL terminates an
 * array of these.
 */
struct LineBreakPropertiesLang
{
    const char *lang; /**< Language name */
    size_t namelen; /**< Length of name to match */
    struct LineBreakProperties *lbp; /**< Pointer to associated data */
};
/**
 * Abstract function interface for #lb_get_next_char_utf8,
 * #lb_get_next_char_utf16, and #lb_get_next_char_utf32.
 *
 * The arguments are the input string, its length in code units, and a
 * pointer to the current index, which the function advances.  The
 * return value is the character read, or #EOS at the end of input.
 */
typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *);
/* Declarations */
/* Default properties array; terminated by an LBP_Undefined entry */
extern struct LineBreakProperties lb_prop_default[];
/* Language map; terminated by an entry with a NULL language name */
extern struct LineBreakPropertiesLang lb_prop_lang_map[];
/* Function Prototype */
/* Character readers: return the character at *ip and advance *ip, or
 * return EOS when the input is exhausted */
utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip);
utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip);
utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip);
/* Encoding-independent worker; get_next_char selects the encoding */
void set_linebreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char);