forked from enlightenment/efl
Evas: Added liblinebreak (static dep) to the build process.
SVN revision: 59203
This commit is contained in:
parent
e307051ccb
commit
bf909af0f3
|
@ -272,6 +272,33 @@ if test "x${want_fontconfig}" = "xyes" -o "x${want_fontconfig}" = "xauto" ; then
|
||||||
])
|
])
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# linebreak
# Optional use of the liblinebreak copy kept under src/static_deps.
# It is enabled by default; --disable-liblinebreak switches it off.
|
||||||
|
have_linebreak="no"
|
||||||
|
want_linebreak="yes"
|
||||||
|
AC_ARG_ENABLE([liblinebreak],
|
||||||
|
AC_HELP_STRING([--disable-liblinebreak],
|
||||||
|
[disable linking against liblinebreak. @<:@default=enabled@:>@]),
|
||||||
|
[
|
||||||
|
if test "x${enableval}" = "xyes" ; then
|
||||||
|
want_linebreak="yes"
|
||||||
|
else
|
||||||
|
want_linebreak="no"
|
||||||
|
fi
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
# Let the Makefiles test the user's choice through EVAS_USE_LINEBREAK.
AM_CONDITIONAL(EVAS_USE_LINEBREAK, test "x${want_linebreak}" = "xyes")
|
||||||
|
# No detection is run here: when wanted, the flags simply point at the
# in-tree headers and the libtool archive built in src/static_deps.
if test "x${want_linebreak}" = "xyes" ; then
|
||||||
|
have_linebreak="yes"
|
||||||
|
LINEBREAK_CFLAGS='-I$(top_srcdir)/src/static_deps/liblinebreak'
|
||||||
|
LINEBREAK_LIBS='$(top_builddir)/src/static_deps/liblinebreak/liblinebreak.la'
|
||||||
|
AC_SUBST(LINEBREAK_CFLAGS)
|
||||||
|
AC_SUBST(LINEBREAK_LIBS)
|
||||||
|
AC_DEFINE(HAVE_LINEBREAK, 1, [have liblinebreak support])
|
||||||
|
else
|
||||||
|
have_linebreak="no"
|
||||||
|
fi
|
||||||
|
|
||||||
# fribidi support
|
# fribidi support
|
||||||
have_fribidi="no"
|
have_fribidi="no"
|
||||||
AC_ARG_ENABLE([fribidi],
|
AC_ARG_ENABLE([fribidi],
|
||||||
|
@ -1645,6 +1672,8 @@ src/modules/savers/eet/Makefile
|
||||||
src/modules/savers/jpeg/Makefile
|
src/modules/savers/jpeg/Makefile
|
||||||
src/modules/savers/png/Makefile
|
src/modules/savers/png/Makefile
|
||||||
src/modules/savers/tiff/Makefile
|
src/modules/savers/tiff/Makefile
|
||||||
|
src/static_deps/Makefile
|
||||||
|
src/static_deps/liblinebreak/Makefile
|
||||||
src/lib/include/Makefile
|
src/lib/include/Makefile
|
||||||
src/examples/Makefile
|
src/examples/Makefile
|
||||||
README
|
README
|
||||||
|
@ -1742,6 +1771,7 @@ echo
|
||||||
echo "Font Rendering Helpers:"
|
echo "Font Rendering Helpers:"
|
||||||
echo " Fribidi.................: $have_fribidi"
|
echo " Fribidi.................: $have_fribidi"
|
||||||
echo " Harfbuzz................: $have_harfbuzz"
|
echo " Harfbuzz................: $have_harfbuzz"
|
||||||
|
echo " liblinebreak............: $have_linebreak"
|
||||||
# FIXME: add non freetype2 font engine support
|
# FIXME: add non freetype2 font engine support
|
||||||
# FIXME: make freetype2 optional
|
# FIXME: make freetype2 optional
|
||||||
echo
|
echo
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
MAINTAINERCLEANFILES = Makefile.in
|
MAINTAINERCLEANFILES = Makefile.in
|
||||||
|
|
||||||
SUBDIRS = lib bin modules examples
|
SUBDIRS = static_deps lib bin modules examples
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
Wu Yongwei. Designed and implemented liblinebreak.
|
||||||
|
|
||||||
|
Nikolay Pultsin. Put forward the original requirements on liblinebreak,
|
||||||
|
performed tests, and made a lot of suggestions on the initial versions.
|
||||||
|
|
||||||
|
Thomas Klausner. Autoconfiscated and libtoolized liblinebreak.
|
|
@ -0,0 +1,397 @@
|
||||||
|
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* LICENCE: Update the copyright year.
|
||||||
|
|
||||||
|
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* NEWS: Add information about the 2.0 release.
|
||||||
|
|
||||||
|
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Doxyfile (PROJECT_NUMBER): Set to `2.0'.
|
||||||
|
(HAVE_DOT): Set to `YES'.
|
||||||
|
|
||||||
|
2010-01-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.c: Update the version number in comment to 2.0.
|
||||||
|
* linebreak.h: Ditto.
|
||||||
|
* linebreakdef.c: Ditto.
|
||||||
|
* linebreakdef.h: Ditto.
|
||||||
|
|
||||||
|
2009-12-17 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Change the values of enum BreakAction to the same length.
|
||||||
|
* linebreak.c (DIRECT_BRK): Rename to DIR_BRK.
|
||||||
|
(INDIRECT_BRK): Rename to IND_BRK.
|
||||||
|
(CM_INDIRECT_BRK): Rename to CMI_BRK.
|
||||||
|
(CM_PROHIBITED_BRK): Rename to CMP_BRK.
|
||||||
|
(PROHIBITED_BRK): Rename to PRH_BRK.
|
||||||
|
|
||||||
|
2009-11-29 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Doxyfile (TAB_SIZE): Set to the correct size `4', as used in the
|
||||||
|
source files.
|
||||||
|
|
||||||
|
2009-11-29 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Update files according to UAX #14-24, for Unicode 5.2.0.
|
||||||
|
* linebreak.c: Update comments about UAX #14.
|
||||||
|
* linebreak.h: Ditto.
|
||||||
|
* linebreakdef.c: Ditto.
|
||||||
|
* linebreakdef.h: Ditto.
|
||||||
|
(LBP_CP): New enumerator for the new `CP' class as defined in
|
||||||
|
UAX #14-24.
|
||||||
|
* linebreak.c (baTable): Update for the new class `CP'.
|
||||||
|
* linebreakdata.c: Regenerate from LineBreak-5.2.0.txt.
|
||||||
|
* README: Update the reference to UAX #14-24, for Unicode 5.2.0.
|
||||||
|
|
||||||
|
2009-05-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* NEWS: Add information about the 1.2 release.
|
||||||
|
|
||||||
|
2009-04-30 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Optimize the Doxygen output.
|
||||||
|
* linebreak.c (lb_prop_index): Adjust its definition format
|
||||||
|
slightly.
|
||||||
|
|
||||||
|
2009-04-30 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Doxyfile (USE_WINDOWS_ENCODING): Remove obsolete tag.
|
||||||
|
(DETAILS_AT_TOP): Ditto.
|
||||||
|
(MAX_DOT_GRAPH_WIDTH): Ditto.
|
||||||
|
(MAX_DOT_GRAPH_HEIGHT): Ditto.
|
||||||
|
(REFERENCED_BY_RELATION): Set to `NO'.
|
||||||
|
(REFERENCES_RELATION): Ditto.
|
||||||
|
(EXCLUDE): Add `filter_dup.c'.
|
||||||
|
|
||||||
|
2009-04-28 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.c (lb_get_next_char_utf8): Fix the issue that the index
|
||||||
|
can point to the middle of a UTF-8 sequence if End of String (EOS)
|
||||||
|
is encountered prematurely (thanks to Nikolay Pultsin and Rick Xu).
|
||||||
|
(lb_get_next_char_utf16): Fix the issue that the index can point to
|
||||||
|
the middle of a UTF-16 surrogate pair if EOS is encountered
|
||||||
|
prematurely.
|
||||||
|
|
||||||
|
2009-04-20 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreakdef.c (lb_prop_English): Remove the specialization of
|
||||||
|
right single quotation mark as closing punctuation mark, because it
|
||||||
|
can be used as apostrophe.
|
||||||
|
(lb_prop_Spanish): Ditto.
|
||||||
|
(lb_prop_French): Ditto.
|
||||||
|
|
||||||
|
2009-04-09 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile.msvc: Make the `clean' target work on MSVC versions other
|
||||||
|
than 6.0; do not use precompiled header.
|
||||||
|
|
||||||
|
2009-03-07 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.h: Correct the wrong date in the documentation comment.
|
||||||
|
* linebreakdef.h: Ditto.
|
||||||
|
|
||||||
|
2009-02-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* configure.ac (AC_INIT): Increase the version to 2.0.
|
||||||
|
* Makefile.am (liblinebreak_la_LDFLAGS): Set the version-info to
|
||||||
|
`2:0'.
|
||||||
|
|
||||||
|
2009-02-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.h (LINEBREAK_VERSION): New macro.
|
||||||
|
(linebreak_version): New global constant declaration.
|
||||||
|
* linebreak.c (linebreak_version): New global constant definition.
|
||||||
|
|
||||||
|
2009-02-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Reduce namespace pollution.
|
||||||
|
* linebreak.c (get_lb_prop_lang): Mark as static.
|
||||||
|
(get_next_char_utf8): Rename to lb_get_next_char_utf8.
|
||||||
|
(get_next_char_utf16): Rename to lb_get_next_char_utf16.
|
||||||
|
(get_next_char_utf32): Rename to lb_get_next_char_utf32.
|
||||||
|
(is_breakable): Rename to is_line_breakable.
|
||||||
|
* linebreak.h (get_next_char_utf8): Remove the function prototype
|
||||||
|
declaration.
|
||||||
|
(get_next_char_utf16): Ditto.
|
||||||
|
(get_next_char_utf32): Ditto.
|
||||||
|
(is_breakable): Rename to is_line_breakable.
|
||||||
|
* linebreakdef.h (lb_get_next_char_utf8): Add the function prototype
|
||||||
|
declaration.
|
||||||
|
(lb_get_next_char_utf16): Ditto.
|
||||||
|
(lb_get_next_char_utf32): Ditto.
|
||||||
|
|
||||||
|
2009-02-06 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* NEWS: Add information about the 1.1 release.
|
||||||
|
|
||||||
|
2009-01-02 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile.am (EXTRA_DIST): Add the missing `LICENCE' file.
|
||||||
|
|
||||||
|
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.c: Update the version number in comment to 1.0.
|
||||||
|
* linebreak.h: Ditto.
|
||||||
|
* linebreakdef.c: Ditto.
|
||||||
|
* linebreakdef.h: Ditto.
|
||||||
|
|
||||||
|
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* NEWS: Update for the 1.0 release.
|
||||||
|
|
||||||
|
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* README: Correct two typos.
|
||||||
|
|
||||||
|
2008-12-31 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* README: Add the online URL reference.
|
||||||
|
|
||||||
|
2008-12-30 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* README: Update the reference to UAX #14-22, for Unicode 5.1.0.
|
||||||
|
|
||||||
|
2008-12-13 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Update files according to UAX #14-22, for Unicode 5.1.0.
|
||||||
|
* linebreak.c (baTable): Update according to Table 2 of UAX #14-22.
|
||||||
|
* linebreakdef.c (lb_prop_Spanish): Remove the unnecessary
|
||||||
|
customization for inverted marks in Spanish.
|
||||||
|
* linebreakdata.c: Regenerate from LineBreak-5.1.0.txt.
|
||||||
|
* linebreak.h: Update comment only.
|
||||||
|
* linebreakdef.h: Ditto.
|
||||||
|
|
||||||
|
2008-12-12 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* README: Update for the new build methods and better readability.
|
||||||
|
|
||||||
|
2008-12-12 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile.msvc: Correct the inconsistent naming in the output
|
||||||
|
message.
|
||||||
|
|
||||||
|
2008-12-12 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* configure.ac (AM_INIT_AUTOMAKE): Mark `foreign'.
|
||||||
|
* bootstrap: New file.
|
||||||
|
* purge: New file.
|
||||||
|
* Makefile.gcc (purge): Remove this target.
|
||||||
|
|
||||||
|
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* NEWS: New file.
|
||||||
|
|
||||||
|
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* AUTHORS: New file.
|
||||||
|
|
||||||
|
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile.gcc (purge): New phony target to purge files generated by
|
||||||
|
autoconfiscation.
|
||||||
|
|
||||||
|
2008-12-10 Thomas Klausner <tk@giga.or.at>
|
||||||
|
|
||||||
|
* configure.ac: New file.
|
||||||
|
* Makefile.am: New file.
|
||||||
|
|
||||||
|
2008-12-10 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Doxyfile (OUTPUT_DIRECTORY): Set to `doc'.
|
||||||
|
(ALPHABETICAL_INDEX): Set to `YES'.
|
||||||
|
|
||||||
|
2008-12-09 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile.msvc: New file.
|
||||||
|
|
||||||
|
2008-12-09 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile: Remove (to become Makefile.gcc).
|
||||||
|
* Makefile.gcc: New file (was Makefile).
|
||||||
|
|
||||||
|
2008-12-07 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.c: Adjust the comment that refers to Unicode Annex 14.
|
||||||
|
* linebreak.h: Ditto.
|
||||||
|
* linebreakdef.c: Ditto.
|
||||||
|
* linebreakdef.h: Ditto.
|
||||||
|
|
||||||
|
2008-12-07 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Use only POSIX basic regexp to ensure maximum portability (issues
|
||||||
|
have been found on Mac OS X, where GNU extensions do not work).
|
||||||
|
* LineBreak1.sed: Replace `[:xdigit:]' with `0-9A-F', and `\+' with
|
||||||
|
`\{1,\}'.
|
||||||
|
* LineBreak2.sed: Ditto.
|
||||||
|
|
||||||
|
2008-12-07 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile: Replace `*.exe' with `filter_dup$(EXEEXT)', since the
|
||||||
|
extension `.exe' is specific to Windows.
|
||||||
|
|
||||||
|
2008-04-20 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Add README and LICENCE files, as well as a Doxyfile to generate
|
||||||
|
documents.
|
||||||
|
* README: New file.
|
||||||
|
* LICENCE: New file.
|
||||||
|
* Doxyfile: New file.
|
||||||
|
* Makefile (doc): Add new phony target.
|
||||||
|
|
||||||
|
2008-04-04 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Remove the English override for plus sign: it is better treated in
|
||||||
|
the text breaking program (see ../breaktext/ for an example).
|
||||||
|
* linebreakdef.c (lb_prop_English): Remove the line for plus sign.
|
||||||
|
|
||||||
|
2008-03-29 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile: Correct the dependency-making rules when OLDGCC=Y.
|
||||||
|
|
||||||
|
2008-03-23 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile (clean): Do not remove *.exe and tags here.
|
||||||
|
(distclean): Remove *.exe and tags.
|
||||||
|
|
||||||
|
2008-03-23 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Remove the English override for solidus: it is better treated in the
|
||||||
|
text breaking program (see ../breaktext/ for an example).
|
||||||
|
* linebreakdef.c (lb_prop_English): Remove the line for solidus.
|
||||||
|
|
||||||
|
2008-03-16 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Rename init_linebreak_prop_index to init_linebreak for future
|
||||||
|
safety; make visible certain functions that are potentially useful.
|
||||||
|
* linebreak.c (init_linebreak_prop_index): Rename to init_linebreak.
|
||||||
|
(get_next_char_t): Move to linebreakdef.h.
|
||||||
|
(get_next_char_utf8): Make non-static.
|
||||||
|
(get_next_char_utf16): Ditto.
|
||||||
|
(get_next_char_utf32): Ditto.
|
||||||
|
(set_linebreaks): Ditto.
|
||||||
|
* linebreak.h (init_linebreak_prop_index): Rename to init_linebreak.
|
||||||
|
(get_next_char_utf8): Add the function prototype.
|
||||||
|
(get_next_char_utf16): Ditto.
|
||||||
|
(get_next_char_utf32): Ditto.
|
||||||
|
* linebreakdef.h (get_next_char_t): Add the typedef.
|
||||||
|
(set_linebreaks): Add the function prototype.
|
||||||
|
|
||||||
|
2008-03-16 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile (OLDGCC): Add support for GCC 2.95.3 (when OLDGCC=Y).
|
||||||
|
|
||||||
|
2008-03-15 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.c (set_linebreaks): Fix a bug that `==' was wrongly used
|
||||||
|
for `='.
|
||||||
|
|
||||||
|
2008-03-05 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Improve the performance by reducing the look-ups of the
|
||||||
|
language-specific line breaking properties array from the language
|
||||||
|
name (thanks to Nikolay Pultsin).
|
||||||
|
* linebreak.c (get_lb_prop_lang): New function.
|
||||||
|
(get_char_lb_class_lang): Change the second parameter from the
|
||||||
|
language name to the line breaking properties array.
|
||||||
|
(set_linebreaks): Look up the language-specific line breaking
|
||||||
|
properties array from the language name only once in one function
|
||||||
|
call.
|
||||||
|
|
||||||
|
2008-03-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Make minor adjustments in code and comments.
|
||||||
|
* linebreak.c: Adjust the doc comments.
|
||||||
|
(init_linebreak_prop_index): Modify a conditional to make it more
|
||||||
|
robust and consistent.
|
||||||
|
* linebreakdef.c (lb_prop_lang_map): Replace the pointer
|
||||||
|
lb_prop_default with NULL, since the value is never used.
|
||||||
|
|
||||||
|
2008-03-03 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Accelerate get_char_lb_class for invalid Unicode code points.
|
||||||
|
* linebreak.c (get_char_lb_class): Adjust the conditionals so that
|
||||||
|
getting the line breaking class for an invalid code point is much
|
||||||
|
faster, which requires the array of line breaking properties be
|
||||||
|
sorted.
|
||||||
|
* linebreakdef.h: Adjust a comment that the array of line break
|
||||||
|
properties must be sorted.
|
||||||
|
|
||||||
|
2008-03-02 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Change the values of enum BreakAction to more complete forms.
|
||||||
|
* linebreak.c (INDRCT_BRK): Rename to INDIRECT_BRK.
|
||||||
|
(CM_INDRCT_BRK): Rename to CM_INDIRECT_BRK.
|
||||||
|
(CM_PROHIBTD_BRK): Rename to CM_PROHIBITED_BRK.
|
||||||
|
(PROHIBTD_BRK): Rename to PROHIBITED_BRK.
|
||||||
|
|
||||||
|
2008-03-02 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Implement a two-stage search in get_char_lb_class_default to
|
||||||
|
accelerate the overall performance, especially for non-Latin
|
||||||
|
languages.
|
||||||
|
* linebreak.c (LINEBREAK_INDEX_SIZE): New constant macro.
|
||||||
|
(struct LineBreakPropertiesIndex): New struct.
|
||||||
|
(lb_prop_index): New static variable.
|
||||||
|
(init_linebreak_prop_index): New function.
|
||||||
|
(get_char_lb_class_default): New function.
|
||||||
|
(get_char_lb_class_lang): Use get_char_lb_class_default.
|
||||||
|
* linebreak.h: Detect C++ and add extern "C" guard if necessary.
|
||||||
|
(init_linebreak_prop_index): Add the prototype declaration.
|
||||||
|
* linebreakdef.h: Adjust a comment.
|
||||||
|
|
||||||
|
2008-03-02 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Split/refactor the code; add (doc) comments.
|
||||||
|
* Makefile (CFILES): Add linebreakdata.c and linebreakdef.c.
|
||||||
|
* linebreak.c: Add and adjust comments.
|
||||||
|
(linebreakdef.h): Add include file.
|
||||||
|
(linebreakdata.c): Remove include file.
|
||||||
|
(EOS): Remove (now in linebreakdef.h).
|
||||||
|
(enum LineBreakClass): Ditto.
|
||||||
|
(struct LineBreakProperties): Ditto.
|
||||||
|
(lbpEnglish): Remove (now in linebreakdef.c as lb_prop_English).
|
||||||
|
(lbpGerman): Remove (now in linebreakdef.c as lb_prop_German).
|
||||||
|
(lbpSpanish): Remove (now in linebreakdef.c as lb_prop_Spanish).
|
||||||
|
(lbpFrench): Remove (now in linebreakdef.c as lb_prop_French).
|
||||||
|
(lbpRussian): Remove (now in linebreakdef.c as lb_prop_Russian).
|
||||||
|
(lbpChinese): Remove (now in linebreakdef.c as lb_prop_Chinese).
|
||||||
|
(struct LineBreakPropertiesLang): Remove (now in linebreakdef.h).
|
||||||
|
(lbpLangs): Remove (now in linebreakdef.c as lb_prop_lang_map).
|
||||||
|
(get_next_char_utf16): Make sure memory access not go beyond len.
|
||||||
|
* linebreak.h: Add copyright information and adjust comments.
|
||||||
|
(stddef.h): Add include file.
|
||||||
|
* linebreakdata.c (linebreak.h): Add include file.
|
||||||
|
(linebreakdef.h): Add include file.
|
||||||
|
(lbpDefault): Make global and rename to lb_prop_default.
|
||||||
|
* linebreakdata2.tmpl: Add two include files, a comment line, and
|
||||||
|
remove `static'.
|
||||||
|
* linebreakdef.c: New file.
|
||||||
|
* linebreakdef.h: New file.
|
||||||
|
|
||||||
|
2008-02-26 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* linebreak.c (lbpSpanish): New array for Spanish-specific data.
|
||||||
|
(lbpLangs): Update the index array for Spanish.
|
||||||
|
(resolve_lb_class): Resolve AmbIguous class to IDeographic in
|
||||||
|
Chinese, Japanese, and Korean.
|
||||||
|
|
||||||
|
2008-02-26 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
* Makefile (LineBreak.txt): Add new rule to retrieve it from the Web
|
||||||
|
if it is not already there.
|
||||||
|
|
||||||
|
2008-02-23 Wu Yongwei <wuyongwei@gmail.com>
|
||||||
|
|
||||||
|
Add files for linebreak.
|
||||||
|
* LineBreak1.sed: New file.
|
||||||
|
* LineBreak2.sed: New file.
|
||||||
|
* Makefile: New file.
|
||||||
|
* filter_dup.c: New file.
|
||||||
|
* linebreak.c: New file.
|
||||||
|
* linebreak.h: New file.
|
||||||
|
* linebreakdata.c: New file.
|
||||||
|
* linebreakdata1.tmpl: New file.
|
||||||
|
* linebreakdata2.tmpl: New file.
|
||||||
|
* linebreakdata3.tmpl: New file.
|
|
@ -0,0 +1,18 @@
|
||||||
|
Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
|
||||||
|
|
||||||
|
This software is provided 'as-is', without any express or implied
|
||||||
|
warranty. In no event will the author be held liable for any damages
|
||||||
|
arising from the use of this software.
|
||||||
|
|
||||||
|
Permission is granted to anyone to use this software for any purpose,
|
||||||
|
including commercial applications, and to alter it and redistribute it
|
||||||
|
freely, subject to the following restrictions:
|
||||||
|
|
||||||
|
1. The origin of this software must not be misrepresented; you must not
|
||||||
|
claim that you wrote the original software. If you use this software
|
||||||
|
in a product, an acknowledgement in the product documentation would
|
||||||
|
be appreciated but is not required.
|
||||||
|
2. Altered source versions must be plainly marked as such, and must not
|
||||||
|
be misrepresented as being the original software.
|
||||||
|
3. This notice may not be removed or altered from any source
|
||||||
|
distribution.
|
|
@ -0,0 +1 @@
|
||||||
|
# Match lines of the form "<hex digits and dots>;<class> #<comment>", where
# <class> is an uppercase letter followed by an uppercase letter or digit,
# and strip the trailing " #<comment>".  Only matching lines are printed by
# the p flag, so this is presumably meant to be run with `sed -n`.
s/\(^[0-9A-F.]\{1,\};[A-Z][A-Z0-9]\) #.*/\1/p
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Turn a single code point "N;" into the one-element range "N..N;" so that
# every line has the same BEG..END shape for the next command.
s/^\([0-9A-F]\{1,\}\);/\1..\1;/
|
||||||
|
# Rewrite "BEG..END;XX" as the C table entry "{ 0xBEG, 0xEND, LBP_XX },".
s/^\([0-9A-F]\{1,\}\)\.\.\([0-9A-F]\{1,\}\);\([A-Z][A-Z0-9]\)/ { 0x\1, 0x\2, LBP_\3 },/
|
|
@ -0,0 +1,16 @@
|
||||||
|
MAINTAINERCLEANFILES = Makefile.in

#noinst_PROGRAMS = filter_dup

# liblinebreak is only built as a non-installed convenience library
# (noinst_LTLIBRARIES below), so its headers are distributed but must not be
# installed: installing them would drop linebreak.h into $(includedir)
# without any matching library and could shadow a system liblinebreak.
noinst_HEADERS = linebreak.h linebreakdef.h
noinst_LTLIBRARIES = liblinebreak.la

liblinebreak_la_SOURCES = \
    linebreak.c \
    linebreakdata.c \
    linebreakdef.c

# Inputs used to regenerate linebreakdata.c, plus the licence text.
EXTRA_DIST = \
    LineBreak1.sed \
    LineBreak2.sed \
    linebreakdata1.tmpl \
    linebreakdata2.tmpl \
    linebreakdata3.tmpl \
    LICENCE
|
|
@ -0,0 +1,37 @@
|
||||||
|
New in 2.0
|
||||||
|
|
||||||
|
- Update the algorithm and data according to UAX #14-24 and
|
||||||
|
LineBreak-5.2.0.txt
|
||||||
|
- Rename some functions to reduce namespace pollution
|
||||||
|
- Make Doxygen documentation better
|
||||||
|
|
||||||
|
New in 1.2
|
||||||
|
|
||||||
|
- Fix the bug that an assertion in code can fail if an invalid UTF-8 or
|
||||||
|
UTF-16 sequence is encountered near the end of input
|
||||||
|
- Remove the specialization of right single quotation mark as closing
|
||||||
|
punctuation mark in English, French, and Spanish, because it can be
|
||||||
|
used as apostrophe
|
||||||
|
- Make Doxygen documentation better
|
||||||
|
|
||||||
|
New in 1.1
|
||||||
|
|
||||||
|
- Make get_lb_prop_lang static and not an exported symbol
|
||||||
|
- Define is_line_breakable to alias to is_breakable
|
||||||
|
- Declare that get_next_char_utf* will be renamed to lb_get_next_char_utf*
|
||||||
|
- Move the declarations of get_next_char_utf* from linebreak.h to
|
||||||
|
linebreakdef.h
|
||||||
|
- Add the function documentation comments to the header files
|
||||||
|
|
||||||
|
New in 1.0
|
||||||
|
|
||||||
|
- Update the line breaking data according to UAX #14-22 and
|
||||||
|
LineBreak-5.1.0.txt
|
||||||
|
- Add autoconfiscation support (./configure, make, make install)
|
||||||
|
- Add Makefile for MSVC
|
||||||
|
|
||||||
|
First public release (0.9.6, or 20080421)
|
||||||
|
|
||||||
|
- Implement line breaking algorithm according to UAX #14-19
|
||||||
|
- Line breaking data is generated from LineBreak-5.0.0.txt
|
||||||
|
- Makefile only supports GCC
|
|
@ -0,0 +1,86 @@
|
||||||
|
NOTICE: This is the original version, that was adapted a bit (mostly
|
||||||
|
build related) in order to work nicely with Evas.
|
||||||
|
|
||||||
|
|
||||||
|
L I B L I N E B R E A K
|
||||||
|
=======================
|
||||||
|
|
||||||
|
Overview
|
||||||
|
--------
|
||||||
|
|
||||||
|
This is the README file for liblinebreak, an implementation of the line
|
||||||
|
breaking algorithm as described in Unicode 5.2.0 Standard Annex 14,
|
||||||
|
Revision 24, available at
|
||||||
|
<URL:http://www.unicode.org/reports/tr14/tr14-24.html>
|
||||||
|
|
||||||
|
Check this URL for up-to-date information:
|
||||||
|
<URL:http://vimgadgets.sourceforge.net/liblinebreak/>
|
||||||
|
|
||||||
|
|
||||||
|
Licence
|
||||||
|
-------
|
||||||
|
|
||||||
|
This library is released under an open-source licence, the zlib/libpng
|
||||||
|
licence. Please check the file LICENCE for details.
|
||||||
|
|
||||||
|
Apart from using the algorithm, part of the code is derived from the
|
||||||
|
data provided under
|
||||||
|
<URL:http://www.unicode.org/Public/>
|
||||||
|
|
||||||
|
And the Unicode Terms of Use may apply:
|
||||||
|
<URL:http://www.unicode.org/copyright.html>
|
||||||
|
|
||||||
|
|
||||||
|
Installation
|
||||||
|
------------
|
||||||
|
|
||||||
|
There are three ways to build the library:
|
||||||
|
|
||||||
|
1) On *NIX systems supported by the autoconfiscation tools, do the
|
||||||
|
normal
|
||||||
|
|
||||||
|
./configure
|
||||||
|
make
|
||||||
|
sudo make install
|
||||||
|
|
||||||
|
to build and install both the dynamic and static libraries. In
|
||||||
|
addition, one may
|
||||||
|
|
||||||
|
- type `make doc' to generate the doxygen documentation; or
|
||||||
|
- type `make linebreakdata' to regenerate linebreakdata.c from
|
||||||
|
LineBreak.txt.
|
||||||
|
|
||||||
|
2) On systems where GCC and Binutils are supported, one can type
|
||||||
|
|
||||||
|
cp -p Makefile.gcc Makefile
|
||||||
|
make
|
||||||
|
|
||||||
|
to build the static library. In addition, one may
|
||||||
|
|
||||||
|
- type `make debug' or `make release' to explicitly generate the
|
||||||
|
debug or release build;
|
||||||
|
- type `make doc' to generate the doxygen documentation; or
|
||||||
|
- type `make linebreakdata' to regenerate linebreakdata.c from
|
||||||
|
LineBreak.txt.
|
||||||
|
|
||||||
|
3) On Windows, apart from using method 1 (Cygwin/MSYS) and method 2
|
||||||
|
(MinGW), MSVC can also be used. Type
|
||||||
|
|
||||||
|
nmake -f Makefile.msvc
|
||||||
|
|
||||||
|
to build the static library. By default the debug version is built.
|
||||||
|
To build the release version
|
||||||
|
|
||||||
|
nmake -f Makefile.msvc CFG="linebreak - Win32 Release"
|
||||||
|
|
||||||
|
|
||||||
|
Documentation
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Check the generated document doc/html/linebreak_8h.html for the public
|
||||||
|
interfaces exposed to applications.
|
||||||
|
|
||||||
|
|
||||||
|
$Id: README,v 1.6 2009/11/29 08:09:13 adah Exp $
|
||||||
|
|
||||||
|
vim:autoindent:expandtab:formatoptions=tcqlmn:textwidth=72:
|
|
@ -0,0 +1,48 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
/*
 * Filter that compacts a generated line-breaking property table.
 *
 * Reads lines from stdin.  A line that contains "LBP_" (but not
 * "LBP_Undef") and parses as "{ BEG END PROP }" is a table entry;
 * consecutive entries with the same PROP are merged into one entry that
 * runs from the first BEG to the last END.  The ranges are not checked for
 * adjacency.  Every other line is copied to stdout unchanged, after any
 * pending merged entry has been written.
 *
 * Always returns 0.
 */
int main(void)
{
    char s[80];
    char beg[16];
    char end[16];
    char prop[16];
    char lastbeg[16] = "";
    char lastend[16] = "";
    char lastprop[16] = "";     /* empty string: no entry is pending */

    while (fgets(s, sizeof s, stdin) != NULL)
    {
        /*
         * The %15s widths keep each token inside its 16-byte buffer, and
         * the return value check rejects lines that mention "LBP_" but do
         * not carry all three fields, so stale tokens are never reused.
         */
        int is_entry =
            strstr(s, "LBP_") != NULL &&
            strstr(s, "LBP_Undef") == NULL &&
            sscanf(s, "\t{ %15s %15s %15s }", beg, end, prop) == 3;

        if (!is_entry)
        {
            /* Flush the pending entry before passing the line through */
            if (lastprop[0])
            {
                printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop);
                lastprop[0] = 0;
            }
            printf("%s", s);
            continue;
        }

        /* A different property ends the current run */
        if (lastprop[0] && strcmp(lastprop, prop) != 0)
        {
            printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop);
            lastprop[0] = 0;
        }

        /* Start a new run; the copies fit because of the %15s limits */
        if (lastprop[0] == 0)
        {
            strcpy(lastbeg, beg);
            strcpy(lastprop, prop);
        }
        strcpy(lastend, end);
    }

    /* Input ended in the middle of a run: write what was collected */
    if (lastprop[0])
    {
        printf("\t{ %s %s %s },\n", lastbeg, lastend, lastprop);
    }
    return 0;
}
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
#!/bin/sh
# Regenerate linebreakdata.c from LineBreak.txt.
# Expects LineBreak1.sed, LineBreak2.sed, the linebreakdata*.tmpl templates
# and a built ./filter_dup in the current directory.

# Abort on the first failing step, so that a failed download, a sed error or
# a missing ./filter_dup cannot overwrite linebreakdata.c with bad data.
set -e

# Fetch the Unicode data file only when there is no local copy
if [ ! -f "LineBreak.txt" ]; then
    wget http://unicode.org/Public/UNIDATA/LineBreak.txt
fi

# Extract the data lines, then turn them into merged C table entries
sed -n -f LineBreak1.sed LineBreak.txt > tmp.txt
sed -f LineBreak2.sed tmp.txt | ./filter_dup > tmp.c

# The first two lines of LineBreak.txt go between the first two templates
head -n 2 LineBreak.txt > tmp.txt
cat linebreakdata1.tmpl tmp.txt linebreakdata2.tmpl tmp.c linebreakdata3.tmpl > linebreakdata.c
rm tmp.txt tmp.c
|
||||||
|
|
|
@ -0,0 +1,734 @@
|
||||||
|
/* vim: set tabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
|
* generic text renderer.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
|
||||||
|
*
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute
|
||||||
|
* it freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must
|
||||||
|
* not claim that you wrote the original software. If you use this
|
||||||
|
* software in a product, an acknowledgement in the product
|
||||||
|
* documentation would be appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must
|
||||||
|
* not be misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
|
*
|
||||||
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
|
* Unicode 5.0.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 24, for
|
||||||
|
* Unicode 5.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
|
||||||
|
*
|
||||||
|
* The Unicode Terms of Use are available at
|
||||||
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file linebreak.c
|
||||||
|
*
|
||||||
|
* Implementation of the line breaking algorithm as described in Unicode
|
||||||
|
* Standard Annex 14.
|
||||||
|
*
|
||||||
|
* @version 2.0, 2010/01/03
|
||||||
|
* @author Wu Yongwei
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "linebreak.h"
|
||||||
|
#include "linebreakdef.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Size of the second-level index to the line breaking properties.
|
||||||
|
*/
|
||||||
|
#define LINEBREAK_INDEX_SIZE 40
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Version number of the library.
|
||||||
|
*/
|
||||||
|
const int linebreak_version = LINEBREAK_VERSION;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enumeration of break actions. They are used in the break action
|
||||||
|
* pair table below.
|
||||||
|
*/
|
||||||
|
enum BreakAction
{
    DIR_BRK = 0,    /**< Break opportunity: direct */
    IND_BRK = 1,    /**< Break opportunity: indirect */
    CMI_BRK = 2,    /**< Break opportunity: indirect, for combining marks */
    CMP_BRK = 3,    /**< Break prohibited, for combining marks */
    PRH_BRK = 4     /**< Break prohibited */
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Break action pair table. This is a direct mapping of Table 2 of
|
||||||
|
* Unicode Standard Annex 14, Revision 24.
|
||||||
|
*/
|
||||||
|
static enum BreakAction baTable[LBP_JT][LBP_JT] = {
|
||||||
|
{ /* OP */
|
||||||
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, CMP_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK },
|
||||||
|
{ /* CL */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* CP */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* QU */
|
||||||
|
PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* GL */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* NS */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* EX */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* SY */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* IS */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* PR */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* PO */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* NU */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* AL */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* ID */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* IN */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* HY */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* BA */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* BB */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* B2 */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* ZW */
|
||||||
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK, DIR_BRK,
|
||||||
|
DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* CM */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK },
|
||||||
|
{ /* WJ */
|
||||||
|
IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* H2 */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* H3 */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK },
|
||||||
|
{ /* JL */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK },
|
||||||
|
{ /* JV */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK },
|
||||||
|
{ /* JT */
|
||||||
|
DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
|
||||||
|
PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
|
||||||
|
IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK, CMI_BRK,
|
||||||
|
PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Struct for the second-level index to the line breaking properties.
|
||||||
|
*/
|
||||||
|
struct LineBreakPropertiesIndex
|
||||||
|
{
|
||||||
|
utf32_t end; /**< End coding point */
|
||||||
|
struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Second-level index to the line breaking properties.
|
||||||
|
*/
|
||||||
|
static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
|
||||||
|
{
|
||||||
|
{ 0xFFFFFFFF, lb_prop_default }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initializes the second-level index to the line breaking properties.
|
||||||
|
* If it is not called, the performance of #get_char_lb_class_lang (and
|
||||||
|
* thus the main functionality) can be pretty bad, especially for big
|
||||||
|
* code points like those of Chinese.
|
||||||
|
*/
|
||||||
|
void init_linebreak(void)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
size_t iPropDefault;
|
||||||
|
size_t len;
|
||||||
|
size_t step;
|
||||||
|
|
||||||
|
len = 0;
|
||||||
|
while (lb_prop_default[len].prop != LBP_Undefined)
|
||||||
|
++len;
|
||||||
|
step = len / LINEBREAK_INDEX_SIZE;
|
||||||
|
iPropDefault = 0;
|
||||||
|
for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
|
||||||
|
{
|
||||||
|
lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
|
||||||
|
iPropDefault += step;
|
||||||
|
lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
|
||||||
|
}
|
||||||
|
lb_prop_index[--i].end = 0xFFFFFFFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the language-specific line breaking properties.
|
||||||
|
*
|
||||||
|
* @param lang language of the text
|
||||||
|
* @return pointer to the language-specific line breaking
|
||||||
|
* properties array if found; \c NULL otherwise
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
|
||||||
|
{
|
||||||
|
struct LineBreakPropertiesLang *lbplIter;
|
||||||
|
if (lang != NULL)
|
||||||
|
{
|
||||||
|
for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
|
||||||
|
{
|
||||||
|
if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
|
||||||
|
{
|
||||||
|
return lbplIter->lbp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the line breaking class of a character from a line breaking
|
||||||
|
* properties array.
|
||||||
|
*
|
||||||
|
* @param ch character to check
|
||||||
|
* @param lbp pointer to the line breaking properties array
|
||||||
|
* @return the line breaking class if found; \c LBP_XX otherwise
|
||||||
|
*/
|
||||||
|
static enum LineBreakClass get_char_lb_class(
|
||||||
|
utf32_t ch,
|
||||||
|
struct LineBreakProperties *lbp)
|
||||||
|
{
|
||||||
|
while (lbp->prop != LBP_Undefined && ch >= lbp->start)
|
||||||
|
{
|
||||||
|
if (ch <= lbp->end)
|
||||||
|
return lbp->prop;
|
||||||
|
++lbp;
|
||||||
|
}
|
||||||
|
return LBP_XX;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the line breaking class of a character from the default line
|
||||||
|
* breaking properties array.
|
||||||
|
*
|
||||||
|
* @param ch character to check
|
||||||
|
* @return the line breaking class if found; \c LBP_XX otherwise
|
||||||
|
*/
|
||||||
|
static enum LineBreakClass get_char_lb_class_default(
|
||||||
|
utf32_t ch)
|
||||||
|
{
|
||||||
|
size_t i = 0;
|
||||||
|
while (ch > lb_prop_index[i].end)
|
||||||
|
++i;
|
||||||
|
assert(i < LINEBREAK_INDEX_SIZE);
|
||||||
|
return get_char_lb_class(ch, lb_prop_index[i].lbp);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the line breaking class of a character for a specific
|
||||||
|
* language. This function will check the language-specific data first,
|
||||||
|
* and then the default data if there is no language-specific property
|
||||||
|
* available for the character.
|
||||||
|
*
|
||||||
|
* @param ch character to check
|
||||||
|
* @param lbpLang pointer to the language-specific line breaking
|
||||||
|
* properties array
|
||||||
|
* @return the line breaking class if found; \c LBP_XX
|
||||||
|
* otherwise
|
||||||
|
*/
|
||||||
|
static enum LineBreakClass get_char_lb_class_lang(
|
||||||
|
utf32_t ch,
|
||||||
|
struct LineBreakProperties *lbpLang)
|
||||||
|
{
|
||||||
|
enum LineBreakClass lbcResult;
|
||||||
|
|
||||||
|
/* Find the language-specific line breaking class for a character */
|
||||||
|
if (lbpLang)
|
||||||
|
{
|
||||||
|
lbcResult = get_char_lb_class(ch, lbpLang);
|
||||||
|
if (lbcResult != LBP_XX)
|
||||||
|
return lbcResult;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Find the generic language-specific line breaking class, if no
|
||||||
|
* language context is provided, or language-specific data are not
|
||||||
|
* available for the specific character in the specified language */
|
||||||
|
return get_char_lb_class_default(ch);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolves the line breaking class for certain ambiguous or complicated
|
||||||
|
* characters. They are treated in a simplistic way in this
|
||||||
|
* implementation.
|
||||||
|
*
|
||||||
|
* @param lbc line breaking class to resolve
|
||||||
|
* @param lang language of the text
|
||||||
|
* @return the resolved line breaking class
|
||||||
|
*/
|
||||||
|
static enum LineBreakClass resolve_lb_class(
|
||||||
|
enum LineBreakClass lbc,
|
||||||
|
const char *lang)
|
||||||
|
{
|
||||||
|
switch (lbc)
|
||||||
|
{
|
||||||
|
case LBP_AI:
|
||||||
|
if (lang != NULL &&
|
||||||
|
(strncmp(lang, "zh", 2) == 0 || /* Chinese */
|
||||||
|
strncmp(lang, "ja", 2) == 0 || /* Japanese */
|
||||||
|
strncmp(lang, "ko", 2) == 0)) /* Korean */
|
||||||
|
{
|
||||||
|
return LBP_ID;
|
||||||
|
}
|
||||||
|
/* Fall through */
|
||||||
|
case LBP_SA:
|
||||||
|
case LBP_SG:
|
||||||
|
case LBP_XX:
|
||||||
|
return LBP_AL;
|
||||||
|
default:
|
||||||
|
return lbc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the next Unicode character in a UTF-8 sequence. The index will
|
||||||
|
* be advanced to the next complete character, unless the end of string
|
||||||
|
* is reached in the middle of a UTF-8 sequence.
|
||||||
|
*
|
||||||
|
* @param[in] s input UTF-8 string
|
||||||
|
* @param[in] len length of the string in bytes
|
||||||
|
* @param[in,out] ip pointer to the index
|
||||||
|
* @return the Unicode character beginning at the index; or
|
||||||
|
* #EOS if end of input is encountered
|
||||||
|
*/
|
||||||
|
utf32_t lb_get_next_char_utf8(
|
||||||
|
const utf8_t *s,
|
||||||
|
size_t len,
|
||||||
|
size_t *ip)
|
||||||
|
{
|
||||||
|
utf8_t ch;
|
||||||
|
utf32_t res;
|
||||||
|
|
||||||
|
assert(*ip <= len);
|
||||||
|
if (*ip == len)
|
||||||
|
return EOS;
|
||||||
|
ch = s[*ip];
|
||||||
|
|
||||||
|
if (ch < 0xC2 || ch > 0xF4)
|
||||||
|
{ /* One-byte sequence, tail (should not occur), or invalid */
|
||||||
|
*ip += 1;
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
else if (ch < 0xE0)
|
||||||
|
{ /* Two-byte sequence */
|
||||||
|
if (*ip + 2 > len)
|
||||||
|
return EOS;
|
||||||
|
res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
|
||||||
|
*ip += 2;
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
else if (ch < 0xF0)
|
||||||
|
{ /* Three-byte sequence */
|
||||||
|
if (*ip + 3 > len)
|
||||||
|
return EOS;
|
||||||
|
res = ((ch & 0x0F) << 12) +
|
||||||
|
((s[*ip + 1] & 0x3F) << 6) +
|
||||||
|
((s[*ip + 2] & 0x3F));
|
||||||
|
*ip += 3;
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{ /* Four-byte sequence */
|
||||||
|
if (*ip + 4 > len)
|
||||||
|
return EOS;
|
||||||
|
res = ((ch & 0x07) << 18) +
|
||||||
|
((s[*ip + 1] & 0x3F) << 12) +
|
||||||
|
((s[*ip + 2] & 0x3F) << 6) +
|
||||||
|
((s[*ip + 3] & 0x3F));
|
||||||
|
*ip += 4;
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the next Unicode character in a UTF-16 sequence. The index will
|
||||||
|
* be advanced to the next complete character, unless the end of string
|
||||||
|
* is reached in the middle of a UTF-16 surrogate pair.
|
||||||
|
*
|
||||||
|
* @param[in] s input UTF-16 string
|
||||||
|
* @param[in] len length of the string in words
|
||||||
|
* @param[in,out] ip pointer to the index
|
||||||
|
* @return the Unicode character beginning at the index; or
|
||||||
|
* #EOS if end of input is encountered
|
||||||
|
*/
|
||||||
|
utf32_t lb_get_next_char_utf16(
|
||||||
|
const utf16_t *s,
|
||||||
|
size_t len,
|
||||||
|
size_t *ip)
|
||||||
|
{
|
||||||
|
utf16_t ch;
|
||||||
|
|
||||||
|
assert(*ip <= len);
|
||||||
|
if (*ip == len)
|
||||||
|
return EOS;
|
||||||
|
ch = s[(*ip)++];
|
||||||
|
|
||||||
|
if (ch < 0xD800 || ch > 0xDBFF)
|
||||||
|
{ /* If the character is not a high surrogate */
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
if (*ip == len)
|
||||||
|
{ /* If the input ends here (an error) */
|
||||||
|
--(*ip);
|
||||||
|
return EOS;
|
||||||
|
}
|
||||||
|
if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
|
||||||
|
{ /* If the next character is not the low surrogate (an error) */
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
/* Return the constructed character and advance the index again */
|
||||||
|
return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets the next Unicode character in a UTF-32 sequence. The index will
|
||||||
|
* be advanced to the next character.
|
||||||
|
*
|
||||||
|
* @param[in] s input UTF-32 string
|
||||||
|
* @param[in] len length of the string in dwords
|
||||||
|
* @param[in,out] ip pointer to the index
|
||||||
|
* @return the Unicode character beginning at the index; or
|
||||||
|
* #EOS if end of input is encountered
|
||||||
|
*/
|
||||||
|
utf32_t lb_get_next_char_utf32(
|
||||||
|
const utf32_t *s,
|
||||||
|
size_t len,
|
||||||
|
size_t *ip)
|
||||||
|
{
|
||||||
|
assert(*ip <= len);
|
||||||
|
if (*ip == len)
|
||||||
|
return EOS;
|
||||||
|
return s[(*ip)++];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the line breaking information for a generic input string.
|
||||||
|
*
|
||||||
|
* @param[in] s input string
|
||||||
|
* @param[in] len length of the input
|
||||||
|
* @param[in] lang language of the input
|
||||||
|
* @param[out] brks pointer to the output breaking data,
|
||||||
|
* containing #LINEBREAK_MUSTBREAK,
|
||||||
|
* #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
|
||||||
|
* or #LINEBREAK_INSIDEACHAR
|
||||||
|
* @param[in] get_next_char function to get the next UTF-32 character
|
||||||
|
*/
|
||||||
|
void set_linebreaks(
|
||||||
|
const void *s,
|
||||||
|
size_t len,
|
||||||
|
const char *lang,
|
||||||
|
char *brks,
|
||||||
|
get_next_char_t get_next_char)
|
||||||
|
{
|
||||||
|
utf32_t ch;
|
||||||
|
enum LineBreakClass lbcCur;
|
||||||
|
enum LineBreakClass lbcNew;
|
||||||
|
enum LineBreakClass lbcLast;
|
||||||
|
struct LineBreakProperties *lbpLang;
|
||||||
|
size_t posCur = 0;
|
||||||
|
size_t posLast = 0;
|
||||||
|
|
||||||
|
--posLast; /* To be ++'d later */
|
||||||
|
ch = get_next_char(s, len, &posCur);
|
||||||
|
if (ch == EOS)
|
||||||
|
return;
|
||||||
|
lbpLang = get_lb_prop_lang(lang);
|
||||||
|
lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
|
||||||
|
lbcNew = LBP_Undefined;
|
||||||
|
|
||||||
|
nextline:
|
||||||
|
|
||||||
|
/* Special treatment for the first character */
|
||||||
|
switch (lbcCur)
|
||||||
|
{
|
||||||
|
case LBP_LF:
|
||||||
|
case LBP_NL:
|
||||||
|
lbcCur = LBP_BK;
|
||||||
|
break;
|
||||||
|
case LBP_SP:
|
||||||
|
lbcCur = LBP_WJ;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Process a line till an explicit break or end of string */
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
for (++posLast; posLast < posCur - 1; ++posLast)
|
||||||
|
{
|
||||||
|
brks[posLast] = LINEBREAK_INSIDEACHAR;
|
||||||
|
}
|
||||||
|
assert(posLast == posCur - 1);
|
||||||
|
lbcLast = lbcNew;
|
||||||
|
ch = get_next_char(s, len, &posCur);
|
||||||
|
if (ch == EOS)
|
||||||
|
break;
|
||||||
|
lbcNew = get_char_lb_class_lang(ch, lbpLang);
|
||||||
|
if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
|
||||||
|
{
|
||||||
|
brks[posLast] = LINEBREAK_MUSTBREAK;
|
||||||
|
lbcCur = resolve_lb_class(lbcNew, lang);
|
||||||
|
goto nextline;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (lbcNew)
|
||||||
|
{
|
||||||
|
case LBP_SP:
|
||||||
|
brks[posLast] = LINEBREAK_NOBREAK;
|
||||||
|
continue;
|
||||||
|
case LBP_BK:
|
||||||
|
case LBP_LF:
|
||||||
|
case LBP_NL:
|
||||||
|
brks[posLast] = LINEBREAK_NOBREAK;
|
||||||
|
lbcCur = LBP_BK;
|
||||||
|
continue;
|
||||||
|
case LBP_CR:
|
||||||
|
brks[posLast] = LINEBREAK_NOBREAK;
|
||||||
|
lbcCur = LBP_CR;
|
||||||
|
continue;
|
||||||
|
case LBP_CB:
|
||||||
|
brks[posLast] = LINEBREAK_ALLOWBREAK;
|
||||||
|
lbcCur = LBP_BA;
|
||||||
|
continue;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
lbcNew = resolve_lb_class(lbcNew, lang);
|
||||||
|
|
||||||
|
assert(lbcCur <= LBP_JT);
|
||||||
|
assert(lbcNew <= LBP_JT);
|
||||||
|
switch (baTable[lbcCur - 1][lbcNew - 1])
|
||||||
|
{
|
||||||
|
case DIR_BRK:
|
||||||
|
brks[posLast] = LINEBREAK_ALLOWBREAK;
|
||||||
|
break;
|
||||||
|
case CMI_BRK:
|
||||||
|
case IND_BRK:
|
||||||
|
if (lbcLast == LBP_SP)
|
||||||
|
{
|
||||||
|
brks[posLast] = LINEBREAK_ALLOWBREAK;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
brks[posLast] = LINEBREAK_NOBREAK;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case CMP_BRK:
|
||||||
|
brks[posLast] = LINEBREAK_NOBREAK;
|
||||||
|
if (lbcLast != LBP_SP)
|
||||||
|
continue;
|
||||||
|
break;
|
||||||
|
case PRH_BRK:
|
||||||
|
brks[posLast] = LINEBREAK_NOBREAK;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
lbcCur = lbcNew;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(posLast == posCur - 1 && posCur <= len);
|
||||||
|
/* Break after the last character */
|
||||||
|
brks[posLast] = LINEBREAK_MUSTBREAK;
|
||||||
|
/* When the input contains incomplete sequences */
|
||||||
|
while (posCur < len)
|
||||||
|
{
|
||||||
|
brks[posCur++] = LINEBREAK_INSIDEACHAR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the line breaking information for a UTF-8 input string.
|
||||||
|
*
|
||||||
|
* @param[in] s input UTF-8 string
|
||||||
|
* @param[in] len length of the input
|
||||||
|
* @param[in] lang language of the input
|
||||||
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
|
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
||||||
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
||||||
|
*/
|
||||||
|
void set_linebreaks_utf8(
|
||||||
|
const utf8_t *s,
|
||||||
|
size_t len,
|
||||||
|
const char *lang,
|
||||||
|
char *brks)
|
||||||
|
{
|
||||||
|
set_linebreaks(s, len, lang, brks,
|
||||||
|
(get_next_char_t)lb_get_next_char_utf8);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the line breaking information for a UTF-16 input string.
|
||||||
|
*
|
||||||
|
* @param[in] s input UTF-16 string
|
||||||
|
* @param[in] len length of the input
|
||||||
|
* @param[in] lang language of the input
|
||||||
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
|
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
||||||
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
||||||
|
*/
|
||||||
|
void set_linebreaks_utf16(
|
||||||
|
const utf16_t *s,
|
||||||
|
size_t len,
|
||||||
|
const char *lang,
|
||||||
|
char *brks)
|
||||||
|
{
|
||||||
|
set_linebreaks(s, len, lang, brks,
|
||||||
|
(get_next_char_t)lb_get_next_char_utf16);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the line breaking information for a UTF-32 input string.
|
||||||
|
*
|
||||||
|
* @param[in] s input UTF-32 string
|
||||||
|
* @param[in] len length of the input
|
||||||
|
* @param[in] lang language of the input
|
||||||
|
* @param[out] brks pointer to the output breaking data, containing
|
||||||
|
* #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
||||||
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
||||||
|
*/
|
||||||
|
void set_linebreaks_utf32(
|
||||||
|
const utf32_t *s,
|
||||||
|
size_t len,
|
||||||
|
const char *lang,
|
||||||
|
char *brks)
|
||||||
|
{
|
||||||
|
set_linebreaks(s, len, lang, brks,
|
||||||
|
(get_next_char_t)lb_get_next_char_utf32);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tells whether a line break can occur between two Unicode characters.
|
||||||
|
* This is a wrapper function to expose a simple interface. Generally
|
||||||
|
* speaking, it is better to use #set_linebreaks_utf32 instead, since
|
||||||
|
* complicated cases involving combining marks, spaces, etc. cannot be
|
||||||
|
* correctly processed.
|
||||||
|
*
|
||||||
|
* @param char1 the first Unicode character
|
||||||
|
* @param char2 the second Unicode character
|
||||||
|
* @param lang language of the input
|
||||||
|
* @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
|
||||||
|
* #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
|
||||||
|
*/
|
||||||
|
int is_line_breakable(
|
||||||
|
utf32_t char1,
|
||||||
|
utf32_t char2,
|
||||||
|
const char* lang)
|
||||||
|
{
|
||||||
|
utf32_t s[2];
|
||||||
|
char brks[2];
|
||||||
|
s[0] = char1;
|
||||||
|
s[1] = char2;
|
||||||
|
set_linebreaks_utf32(s, 2, lang, brks);
|
||||||
|
return brks[0];
|
||||||
|
}
|
|
@ -0,0 +1,87 @@
|
||||||
|
/* vim: set tabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
|
* generic text renderer.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
|
||||||
|
*
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute
|
||||||
|
* it freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must
|
||||||
|
* not claim that you wrote the original software. If you use this
|
||||||
|
* software in a product, an acknowledgement in the product
|
||||||
|
* documentation would be appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must
|
||||||
|
* not be misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
|
*
|
||||||
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
|
* Unicode 5.0.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 24, for
|
||||||
|
* Unicode 5.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
|
||||||
|
*
|
||||||
|
* The Unicode Terms of Use are available at
|
||||||
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file linebreak.h
|
||||||
|
*
|
||||||
|
* Header file for the line breaking algorithm.
|
||||||
|
*
|
||||||
|
* @version 2.0, 2010/01/03
|
||||||
|
* @author Wu Yongwei
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef LINEBREAK_H
#define LINEBREAK_H

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/** Version of the library linebreak */
#define LINEBREAK_VERSION   0x0200
/** Version number the library was built with */
extern const int linebreak_version;

/* The UTF types may be supplied by the including code, which must then
 * define LINEBREAK_UTF_TYPES_DEFINED beforehand */
#ifndef LINEBREAK_UTF_TYPES_DEFINED
#define LINEBREAK_UTF_TYPES_DEFINED
typedef unsigned char   utf8_t;     /**< Type for UTF-8 data points */
typedef unsigned short  utf16_t;    /**< Type for UTF-16 data points */
typedef unsigned int    utf32_t;    /**< Type for UTF-32 data points */
#endif

/* Break statuses written to the output array of the set_linebreaks_*
 * functions and returned by is_line_breakable */
#define LINEBREAK_MUSTBREAK     0   /**< Break is mandatory */
#define LINEBREAK_ALLOWBREAK    1   /**< Break is allowed */
#define LINEBREAK_NOBREAK       2   /**< No break is possible */
#define LINEBREAK_INSIDEACHAR   3   /**< A UTF-8/16 sequence is unfinished */

/** Initializes the index that speeds up the property look-up */
void init_linebreak(void);

/** Sets the line breaking information for a UTF-8 input string */
void set_linebreaks_utf8(
        const utf8_t *s, size_t len, const char* lang, char *brks);

/** Sets the line breaking information for a UTF-16 input string */
void set_linebreaks_utf16(
        const utf16_t *s, size_t len, const char* lang, char *brks);

/** Sets the line breaking information for a UTF-32 input string */
void set_linebreaks_utf32(
        const utf32_t *s, size_t len, const char* lang, char *brks);

/** Tells whether a line break can occur between two characters */
int is_line_breakable(utf32_t char1, utf32_t char2, const char* lang);

#ifdef __cplusplus
}
#endif

#endif /* LINEBREAK_H */
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1 @@
|
||||||
|
/* The content of this file is generated from:
|
|
@ -0,0 +1,7 @@
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "linebreak.h"
|
||||||
|
#include "linebreakdef.h"
|
||||||
|
|
||||||
|
/** Default line breaking properties as from the Unicode Web site. */
|
||||||
|
struct LineBreakProperties lb_prop_default[] = {
|
|
@ -0,0 +1,2 @@
|
||||||
|
{ 0xFFFFFFFF, 0xFFFFFFFF, LBP_Undefined }
|
||||||
|
};
|
|
@ -0,0 +1,139 @@
|
||||||
|
/* vim: set tabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
|
* generic text renderer.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
|
||||||
|
*
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute
|
||||||
|
* it freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must
|
||||||
|
* not claim that you wrote the original software. If you use this
|
||||||
|
* software in a product, an acknowledgement in the product
|
||||||
|
* documentation would be appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must
|
||||||
|
* not be misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
|
*
|
||||||
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
|
* Unicode 5.0.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 24, for
|
||||||
|
* Unicode 5.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
|
||||||
|
*
|
||||||
|
* The Unicode Terms of Use are available at
|
||||||
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file linebreakdef.c
|
||||||
|
*
|
||||||
|
* Definition of language-specific data.
|
||||||
|
*
|
||||||
|
* @version 2.0, 2010/01/03
|
||||||
|
* @author Wu Yongwei
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "linebreak.h"
|
||||||
|
#include "linebreakdef.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* English-specifc data over the default Unicode rules.
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties lb_prop_English[] = {
|
||||||
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
|
{ 0, 0, LBP_Undefined }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* German-specifc data over the default Unicode rules.
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties lb_prop_German[] = {
|
||||||
|
{ 0x00AB, 0x00AB, LBP_CL }, /* Left double angle quotation mark: closing */
|
||||||
|
{ 0x00BB, 0x00BB, LBP_OP }, /* Right double angle quotation mark: opening */
|
||||||
|
{ 0x2018, 0x2018, LBP_CL }, /* Left single quotation mark: closing */
|
||||||
|
{ 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
|
||||||
|
{ 0x2039, 0x2039, LBP_CL }, /* Left single angle quotation mark: closing */
|
||||||
|
{ 0x203A, 0x203A, LBP_OP }, /* Right single angle quotation mark: opening */
|
||||||
|
{ 0, 0, LBP_Undefined }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Spanish-specifc data over the default Unicode rules.
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties lb_prop_Spanish[] = {
|
||||||
|
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
||||||
|
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
||||||
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
|
{ 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
|
||||||
|
{ 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
|
||||||
|
{ 0, 0, LBP_Undefined }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* French-specifc data over the default Unicode rules.
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties lb_prop_French[] = {
|
||||||
|
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
||||||
|
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
||||||
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
|
{ 0x2039, 0x2039, LBP_OP }, /* Left single angle quotation mark: opening */
|
||||||
|
{ 0x203A, 0x203A, LBP_CL }, /* Right single angle quotation mark: closing */
|
||||||
|
{ 0, 0, LBP_Undefined }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Russian-specifc data over the default Unicode rules.
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties lb_prop_Russian[] = {
|
||||||
|
{ 0x00AB, 0x00AB, LBP_OP }, /* Left double angle quotation mark: opening */
|
||||||
|
{ 0x00BB, 0x00BB, LBP_CL }, /* Right double angle quotation mark: closing */
|
||||||
|
{ 0x201C, 0x201C, LBP_CL }, /* Left double quotation mark: closing */
|
||||||
|
{ 0, 0, LBP_Undefined }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Chinese-specifc data over the default Unicode rules.
|
||||||
|
*/
|
||||||
|
static struct LineBreakProperties lb_prop_Chinese[] = {
|
||||||
|
{ 0x2018, 0x2018, LBP_OP }, /* Left single quotation mark: opening */
|
||||||
|
{ 0x2019, 0x2019, LBP_CL }, /* Right single quotation mark: closing */
|
||||||
|
{ 0x201C, 0x201C, LBP_OP }, /* Left double quotation mark: opening */
|
||||||
|
{ 0x201D, 0x201D, LBP_CL }, /* Right double quotation mark: closing */
|
||||||
|
{ 0, 0, LBP_Undefined }
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Association data of language-specific line breaking properties with
|
||||||
|
* language names. This is the definition for the static data in this
|
||||||
|
* file. If you want more flexibility, or do not need the data here,
|
||||||
|
* you may want to redefine \e lb_prop_lang_map in your C source file.
|
||||||
|
*/
|
||||||
|
struct LineBreakPropertiesLang lb_prop_lang_map[] = {
|
||||||
|
{ "en", 2, lb_prop_English },
|
||||||
|
{ "de", 2, lb_prop_German },
|
||||||
|
{ "es", 2, lb_prop_Spanish },
|
||||||
|
{ "fr", 2, lb_prop_French },
|
||||||
|
{ "ru", 2, lb_prop_Russian },
|
||||||
|
{ "zh", 2, lb_prop_Chinese },
|
||||||
|
{ NULL, 0, NULL }
|
||||||
|
};
|
|
@ -0,0 +1,149 @@
|
||||||
|
/* vim: set tabstop=4 shiftwidth=4: */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Line breaking in a Unicode sequence. Designed to be used in a
|
||||||
|
* generic text renderer.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2008-2010 Wu Yongwei <wuyongwei at gmail dot com>
|
||||||
|
*
|
||||||
|
* This software is provided 'as-is', without any express or implied
|
||||||
|
* warranty. In no event will the author be held liable for any damages
|
||||||
|
* arising from the use of this software.
|
||||||
|
*
|
||||||
|
* Permission is granted to anyone to use this software for any purpose,
|
||||||
|
* including commercial applications, and to alter it and redistribute
|
||||||
|
* it freely, subject to the following restrictions:
|
||||||
|
*
|
||||||
|
* 1. The origin of this software must not be misrepresented; you must
|
||||||
|
* not claim that you wrote the original software. If you use this
|
||||||
|
* software in a product, an acknowledgement in the product
|
||||||
|
* documentation would be appreciated but is not required.
|
||||||
|
* 2. Altered source versions must be plainly marked as such, and must
|
||||||
|
* not be misrepresented as being the original software.
|
||||||
|
* 3. This notice may not be removed or altered from any source
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* The main reference is Unicode Standard Annex 14 (UAX #14):
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/>
|
||||||
|
*
|
||||||
|
* When this library was designed, this annex was at Revision 19, for
|
||||||
|
* Unicode 5.0.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
|
||||||
|
*
|
||||||
|
* This library has been updated according to Revision 24, for
|
||||||
|
* Unicode 5.2.0:
|
||||||
|
* <URL:http://www.unicode.org/reports/tr14/tr14-24.html>
|
||||||
|
*
|
||||||
|
* The Unicode Terms of Use are available at
|
||||||
|
* <URL:http://www.unicode.org/copyright.html>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file linebreakdef.h
|
||||||
|
*
|
||||||
|
* Definitions of internal data structures, declarations of global
|
||||||
|
* variables, and function prototypes for the line breaking algorithm.
|
||||||
|
*
|
||||||
|
* @version 2.0, 2010/01/03
|
||||||
|
* @author Wu Yongwei
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constant value to mark the end of string. It is not a valid Unicode
|
||||||
|
* character.
|
||||||
|
*/
|
||||||
|
#define EOS 0xFFFF
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Line break classes. This is a direct mapping of Table 1 of Unicode
|
||||||
|
* Standard Annex 14, Revision 19.
|
||||||
|
*/
|
||||||
|
enum LineBreakClass
|
||||||
|
{
|
||||||
|
/* This is used to signal an error condition. */
|
||||||
|
LBP_Undefined, /**< Undefined */
|
||||||
|
|
||||||
|
/* The following break classes are treated in the pair table. */
|
||||||
|
LBP_OP, /**< Opening punctuation */
|
||||||
|
LBP_CL, /**< Closing punctuation */
|
||||||
|
LBP_CP, /**< Closing parenthesis */
|
||||||
|
LBP_QU, /**< Ambiguous quotation */
|
||||||
|
LBP_GL, /**< Glue */
|
||||||
|
LBP_NS, /**< Non-starters */
|
||||||
|
LBP_EX, /**< Exclamation/Interrogation */
|
||||||
|
LBP_SY, /**< Symbols allowing break after */
|
||||||
|
LBP_IS, /**< Infix separator */
|
||||||
|
LBP_PR, /**< Prefix */
|
||||||
|
LBP_PO, /**< Postfix */
|
||||||
|
LBP_NU, /**< Numeric */
|
||||||
|
LBP_AL, /**< Alphabetic */
|
||||||
|
LBP_ID, /**< Ideographic */
|
||||||
|
LBP_IN, /**< Inseparable characters */
|
||||||
|
LBP_HY, /**< Hyphen */
|
||||||
|
LBP_BA, /**< Break after */
|
||||||
|
LBP_BB, /**< Break before */
|
||||||
|
LBP_B2, /**< Break on either side (but not pair) */
|
||||||
|
LBP_ZW, /**< Zero-width space */
|
||||||
|
LBP_CM, /**< Combining marks */
|
||||||
|
LBP_WJ, /**< Word joiner */
|
||||||
|
LBP_H2, /**< Hangul LV */
|
||||||
|
LBP_H3, /**< Hangul LVT */
|
||||||
|
LBP_JL, /**< Hangul L Jamo */
|
||||||
|
LBP_JV, /**< Hangul V Jamo */
|
||||||
|
LBP_JT, /**< Hangul T Jamo */
|
||||||
|
|
||||||
|
/* The following break classes are not treated in the pair table */
|
||||||
|
LBP_AI, /**< Ambiguous (alphabetic or ideograph) */
|
||||||
|
LBP_BK, /**< Break (mandatory) */
|
||||||
|
LBP_CB, /**< Contingent break */
|
||||||
|
LBP_CR, /**< Carriage return */
|
||||||
|
LBP_LF, /**< Line feed */
|
||||||
|
LBP_NL, /**< Next line */
|
||||||
|
LBP_SA, /**< South-East Asian */
|
||||||
|
LBP_SG, /**< Surrogates */
|
||||||
|
LBP_SP, /**< Space */
|
||||||
|
LBP_XX /**< Unknown */
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Struct for entries of line break properties. The array of the
|
||||||
|
* entries \e must be sorted.
|
||||||
|
*/
|
||||||
|
struct LineBreakProperties
|
||||||
|
{
|
||||||
|
utf32_t start; /**< Starting coding point */
|
||||||
|
utf32_t end; /**< End coding point */
|
||||||
|
enum LineBreakClass prop; /**< The line breaking property */
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Struct for association of language-specific line breaking properties
|
||||||
|
* with language names.
|
||||||
|
*/
|
||||||
|
struct LineBreakPropertiesLang
|
||||||
|
{
|
||||||
|
const char *lang; /**< Language name */
|
||||||
|
size_t namelen; /**< Length of name to match */
|
||||||
|
struct LineBreakProperties *lbp; /**< Pointer to associated data */
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Abstract function interface for #lb_get_next_char_utf8,
|
||||||
|
* #lb_get_next_char_utf16, and #lb_get_next_char_utf32.
|
||||||
|
*/
|
||||||
|
typedef utf32_t (*get_next_char_t)(const void *, size_t, size_t *);
|
||||||
|
|
||||||
|
/* Declarations */
|
||||||
|
extern struct LineBreakProperties lb_prop_default[];
|
||||||
|
extern struct LineBreakPropertiesLang lb_prop_lang_map[];
|
||||||
|
|
||||||
|
/* Function Prototype */
|
||||||
|
utf32_t lb_get_next_char_utf8(const utf8_t *s, size_t len, size_t *ip);
|
||||||
|
utf32_t lb_get_next_char_utf16(const utf16_t *s, size_t len, size_t *ip);
|
||||||
|
utf32_t lb_get_next_char_utf32(const utf32_t *s, size_t len, size_t *ip);
|
||||||
|
void set_linebreaks(
|
||||||
|
const void *s,
|
||||||
|
size_t len,
|
||||||
|
const char *lang,
|
||||||
|
char *brks,
|
||||||
|
get_next_char_t get_next_char);
|
Loading…
Reference in New Issue