summaryrefslogtreecommitdiff
path: root/src/static_libs/libunibreak/linebreak.c
diff options
context:
space:
mode:
author: Tom Hacohen <tom@stosb.com> 2013-01-11 18:16:09 +0000
committer: Tom Hacohen <tom@stosb.com> 2013-01-11 18:16:09 +0000
commit: 88ab486e6332ffe3d789e2435ee9a78751f88bf0 (patch)
tree: 2a2a13369b92122ff157952cb7359f8a4e95a42e /src/static_libs/libunibreak/linebreak.c
parent: d83b83e10befda5a8065069a3152846decf91aa3 (diff)
Efl static_libs: Updated liblinebreak -> libunibreak.
SVN revision: 82652
Diffstat (limited to 'src/static_libs/libunibreak/linebreak.c')
-rw-r--r--src/static_libs/libunibreak/linebreak.c787
1 files changed, 787 insertions, 0 deletions
diff --git a/src/static_libs/libunibreak/linebreak.c b/src/static_libs/libunibreak/linebreak.c
new file mode 100644
index 0000000000..c1ea405883
--- /dev/null
+++ b/src/static_libs/libunibreak/linebreak.c
@@ -0,0 +1,787 @@
1/* vim: set tabstop=4 shiftwidth=4: */
2
3/*
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2008-2012 Wu Yongwei <wuyongwei at gmail dot com>
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
24 * distribution.
25 *
26 * The main reference is Unicode Standard Annex 14 (UAX #14):
27 * <URL:http://www.unicode.org/reports/tr14/>
28 *
29 * When this library was designed, this annex was at Revision 19, for
30 * Unicode 5.0.0:
31 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
32 *
33 * This library has been updated according to Revision 30, for
34 * Unicode 6.2.0:
35 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
36 *
37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html>
39 */
40
41/**
42 * @file linebreak.c
43 *
44 * Implementation of the line breaking algorithm as described in Unicode
45 * Standard Annex 14.
46 *
47 * @version 2.3, 2012/10/06
48 * @author Wu Yongwei
49 */
50
51#include <assert.h>
52#include <stddef.h>
53#include <string.h>
54#include "linebreak.h"
55#include "linebreakdef.h"
56
57/**
58 * Size of the second-level index to the line breaking properties.
59 */
60#define LINEBREAK_INDEX_SIZE 40
61
62/**
63 * Version number of the library.
64 */
65const int linebreak_version = LINEBREAK_VERSION;
66
/**
 * Break actions stored in the break action pair table.  Each value
 * tells what may happen between a pair of adjacent line breaking
 * classes.
 */
enum BreakAction
{
    /** Direct break opportunity */
    DIR_BRK = 0,
    /** Indirect break opportunity */
    IND_BRK = 1,
    /** Indirect break opportunity for combining marks */
    CMI_BRK = 2,
    /** Prohibited break for combining marks */
    CMP_BRK = 3,
    /** Prohibited break */
    PRH_BRK = 4
};
79
80/**
81 * Break action pair table. This is a direct mapping of Table 2 of
82 * Unicode Standard Annex 14, Revision 30.
83 */
84static enum BreakAction baTable[LBP_RI][LBP_RI] = {
85 { /* OP */
86 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
87 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
88 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
89 CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
90 PRH_BRK },
91 { /* CL */
92 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
93 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
94 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
95 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
96 DIR_BRK },
97 { /* CP */
98 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
99 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
100 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
101 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
102 DIR_BRK },
103 { /* QU */
104 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
105 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
106 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
107 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
108 IND_BRK },
109 { /* GL */
110 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
111 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
112 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
113 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
114 IND_BRK },
115 { /* NS */
116 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
117 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
118 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
119 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
120 DIR_BRK },
121 { /* EX */
122 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
123 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
124 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
125 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
126 DIR_BRK },
127 { /* SY */
128 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
129 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
130 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
131 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
132 DIR_BRK },
133 { /* IS */
134 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
135 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
136 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
137 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
138 DIR_BRK },
139 { /* PR */
140 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
141 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
142 IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
143 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
144 DIR_BRK },
145 { /* PO */
146 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
147 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
148 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
149 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
150 DIR_BRK },
151 { /* NU */
152 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
153 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
154 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
155 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
156 DIR_BRK },
157 { /* AL */
158 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
159 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
160 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
161 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
162 DIR_BRK },
163 { /* HL */
164 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
165 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
166 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
167 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
168 DIR_BRK },
169 { /* ID */
170 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
171 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
172 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
173 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
174 DIR_BRK },
175 { /* IN */
176 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
177 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
178 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
179 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
180 DIR_BRK },
181 { /* HY */
182 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
183 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
184 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
185 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
186 DIR_BRK },
187 { /* BA */
188 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
189 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
190 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
191 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
192 DIR_BRK },
193 { /* BB */
194 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
195 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
196 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
197 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
198 IND_BRK },
199 { /* B2 */
200 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
201 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
202 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
203 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
204 DIR_BRK },
205 { /* ZW */
206 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
207 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
208 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
209 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
210 DIR_BRK },
211 { /* CM */
212 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
213 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
214 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
215 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
216 DIR_BRK },
217 { /* WJ */
218 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
219 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
220 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
221 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
222 IND_BRK },
223 { /* H2 */
224 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
225 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
226 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
227 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
228 DIR_BRK },
229 { /* H3 */
230 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
231 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
232 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
233 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
234 DIR_BRK },
235 { /* JL */
236 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
237 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
238 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
239 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
240 DIR_BRK },
241 { /* JV */
242 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
243 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
244 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
245 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
246 DIR_BRK },
247 { /* JT */
248 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
249 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
250 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
251 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
252 DIR_BRK },
253 { /* RI */
254 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
255 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
256 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
257 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
258 IND_BRK },
259};
260
261/**
262 * Struct for the second-level index to the line breaking properties.
263 */
264struct LineBreakPropertiesIndex
265{
266 utf32_t end; /**< End coding point */
267 struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
268};
269
270/**
271 * Second-level index to the line breaking properties.
272 */
273static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
274{
275 { 0xFFFFFFFF, lb_prop_default }
276};
277
278/**
279 * Initializes the second-level index to the line breaking properties.
280 * If it is not called, the performance of #get_char_lb_class_lang (and
281 * thus the main functionality) can be pretty bad, especially for big
282 * code points like those of Chinese.
283 */
284void init_linebreak(void)
285{
286 size_t i;
287 size_t iPropDefault;
288 size_t len;
289 size_t step;
290
291 len = 0;
292 while (lb_prop_default[len].prop != LBP_Undefined)
293 ++len;
294 step = len / LINEBREAK_INDEX_SIZE;
295 iPropDefault = 0;
296 for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
297 {
298 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
299 iPropDefault += step;
300 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
301 }
302 lb_prop_index[--i].end = 0xFFFFFFFF;
303}
304
305/**
306 * Gets the language-specific line breaking properties.
307 *
308 * @param lang language of the text
309 * @return pointer to the language-specific line breaking
310 * properties array if found; \c NULL otherwise
311 */
312static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
313{
314 struct LineBreakPropertiesLang *lbplIter;
315 if (lang != NULL)
316 {
317 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
318 {
319 if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
320 {
321 return lbplIter->lbp;
322 }
323 }
324 }
325 return NULL;
326}
327
328/**
329 * Gets the line breaking class of a character from a line breaking
330 * properties array.
331 *
332 * @param ch character to check
333 * @param lbp pointer to the line breaking properties array
334 * @return the line breaking class if found; \c LBP_XX otherwise
335 */
336static enum LineBreakClass get_char_lb_class(
337 utf32_t ch,
338 struct LineBreakProperties *lbp)
339{
340 while (lbp->prop != LBP_Undefined && ch >= lbp->start)
341 {
342 if (ch <= lbp->end)
343 return lbp->prop;
344 ++lbp;
345 }
346 return LBP_XX;
347}
348
349/**
350 * Gets the line breaking class of a character from the default line
351 * breaking properties array.
352 *
353 * @param ch character to check
354 * @return the line breaking class if found; \c LBP_XX otherwise
355 */
356static enum LineBreakClass get_char_lb_class_default(
357 utf32_t ch)
358{
359 size_t i = 0;
360 while (ch > lb_prop_index[i].end)
361 ++i;
362 assert(i < LINEBREAK_INDEX_SIZE);
363 return get_char_lb_class(ch, lb_prop_index[i].lbp);
364}
365
366/**
367 * Gets the line breaking class of a character for a specific
368 * language. This function will check the language-specific data first,
369 * and then the default data if there is no language-specific property
370 * available for the character.
371 *
372 * @param ch character to check
373 * @param lbpLang pointer to the language-specific line breaking
374 * properties array
375 * @return the line breaking class if found; \c LBP_XX
376 * otherwise
377 */
378static enum LineBreakClass get_char_lb_class_lang(
379 utf32_t ch,
380 struct LineBreakProperties *lbpLang)
381{
382 enum LineBreakClass lbcResult;
383
384 /* Find the language-specific line breaking class for a character */
385 if (lbpLang)
386 {
387 lbcResult = get_char_lb_class(ch, lbpLang);
388 if (lbcResult != LBP_XX)
389 return lbcResult;
390 }
391
392 /* Find the generic language-specific line breaking class, if no
393 * language context is provided, or language-specific data are not
394 * available for the specific character in the specified language */
395 return get_char_lb_class_default(ch);
396}
397
398/**
399 * Resolves the line breaking class for certain ambiguous or complicated
400 * characters. They are treated in a simplistic way in this
401 * implementation.
402 *
403 * @param lbc line breaking class to resolve
404 * @param lang language of the text
405 * @return the resolved line breaking class
406 */
407static enum LineBreakClass resolve_lb_class(
408 enum LineBreakClass lbc,
409 const char *lang)
410{
411 switch (lbc)
412 {
413 case LBP_AI:
414 if (lang != NULL &&
415 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
416 strncmp(lang, "ja", 2) == 0 || /* Japanese */
417 strncmp(lang, "ko", 2) == 0)) /* Korean */
418 {
419 return LBP_ID;
420 }
421 else
422 {
423 return LBP_AL;
424 }
425 case LBP_CJ:
426 /* Simplified for `normal' line breaking. See
427 * <url:http://www.unicode.org/reports/tr14/tr14-28.html#CJ>
428 * for details. */
429 return LBP_ID;
430 case LBP_SA:
431 case LBP_SG:
432 case LBP_XX:
433 return LBP_AL;
434 default:
435 return lbc;
436 }
437}
438
439/**
440 * Gets the next Unicode character in a UTF-8 sequence. The index will
441 * be advanced to the next complete character, unless the end of string
442 * is reached in the middle of a UTF-8 sequence.
443 *
444 * @param[in] s input UTF-8 string
445 * @param[in] len length of the string in bytes
446 * @param[in,out] ip pointer to the index
447 * @return the Unicode character beginning at the index; or
448 * #EOS if end of input is encountered
449 */
450utf32_t lb_get_next_char_utf8(
451 const utf8_t *s,
452 size_t len,
453 size_t *ip)
454{
455 utf8_t ch;
456 utf32_t res;
457
458 assert(*ip <= len);
459 if (*ip == len)
460 return EOS;
461 ch = s[*ip];
462
463 if (ch < 0xC2 || ch > 0xF4)
464 { /* One-byte sequence, tail (should not occur), or invalid */
465 *ip += 1;
466 return ch;
467 }
468 else if (ch < 0xE0)
469 { /* Two-byte sequence */
470 if (*ip + 2 > len)
471 return EOS;
472 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
473 *ip += 2;
474 return res;
475 }
476 else if (ch < 0xF0)
477 { /* Three-byte sequence */
478 if (*ip + 3 > len)
479 return EOS;
480 res = ((ch & 0x0F) << 12) +
481 ((s[*ip + 1] & 0x3F) << 6) +
482 ((s[*ip + 2] & 0x3F));
483 *ip += 3;
484 return res;
485 }
486 else
487 { /* Four-byte sequence */
488 if (*ip + 4 > len)
489 return EOS;
490 res = ((ch & 0x07) << 18) +
491 ((s[*ip + 1] & 0x3F) << 12) +
492 ((s[*ip + 2] & 0x3F) << 6) +
493 ((s[*ip + 3] & 0x3F));
494 *ip += 4;
495 return res;
496 }
497}
498
499/**
500 * Gets the next Unicode character in a UTF-16 sequence. The index will
501 * be advanced to the next complete character, unless the end of string
502 * is reached in the middle of a UTF-16 surrogate pair.
503 *
504 * @param[in] s input UTF-16 string
505 * @param[in] len length of the string in words
506 * @param[in,out] ip pointer to the index
507 * @return the Unicode character beginning at the index; or
508 * #EOS if end of input is encountered
509 */
510utf32_t lb_get_next_char_utf16(
511 const utf16_t *s,
512 size_t len,
513 size_t *ip)
514{
515 utf16_t ch;
516
517 assert(*ip <= len);
518 if (*ip == len)
519 return EOS;
520 ch = s[(*ip)++];
521
522 if (ch < 0xD800 || ch > 0xDBFF)
523 { /* If the character is not a high surrogate */
524 return ch;
525 }
526 if (*ip == len)
527 { /* If the input ends here (an error) */
528 --(*ip);
529 return EOS;
530 }
531 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
532 { /* If the next character is not the low surrogate (an error) */
533 return ch;
534 }
535 /* Return the constructed character and advance the index again */
536 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
537}
538
539/**
540 * Gets the next Unicode character in a UTF-32 sequence. The index will
541 * be advanced to the next character.
542 *
543 * @param[in] s input UTF-32 string
544 * @param[in] len length of the string in dwords
545 * @param[in,out] ip pointer to the index
546 * @return the Unicode character beginning at the index; or
547 * #EOS if end of input is encountered
548 */
549utf32_t lb_get_next_char_utf32(
550 const utf32_t *s,
551 size_t len,
552 size_t *ip)
553{
554 assert(*ip <= len);
555 if (*ip == len)
556 return EOS;
557 return s[(*ip)++];
558}
559
560/**
561 * Sets the line breaking information for a generic input string.
562 *
563 * @param[in] s input string
564 * @param[in] len length of the input
565 * @param[in] lang language of the input
566 * @param[out] brks pointer to the output breaking data,
567 * containing #LINEBREAK_MUSTBREAK,
568 * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
569 * or #LINEBREAK_INSIDEACHAR
570 * @param[in] get_next_char function to get the next UTF-32 character
571 */
572void set_linebreaks(
573 const void *s,
574 size_t len,
575 const char *lang,
576 char *brks,
577 get_next_char_t get_next_char)
578{
579 utf32_t ch;
580 enum LineBreakClass lbcCur;
581 enum LineBreakClass lbcNew;
582 enum LineBreakClass lbcLast;
583 struct LineBreakProperties *lbpLang;
584 size_t posCur = 0;
585 size_t posLast = 0;
586
587 --posLast; /* To be ++'d later */
588 ch = get_next_char(s, len, &posCur);
589 if (ch == EOS)
590 return;
591 lbpLang = get_lb_prop_lang(lang);
592 lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
593 lbcNew = LBP_Undefined;
594
595nextline:
596
597 /* Special treatment for the first character */
598 switch (lbcCur)
599 {
600 case LBP_LF:
601 case LBP_NL:
602 lbcCur = LBP_BK;
603 break;
604 case LBP_CB:
605 lbcCur = LBP_BA;
606 break;
607 case LBP_SP:
608 lbcCur = LBP_WJ;
609 break;
610 default:
611 break;
612 }
613
614 /* Process a line till an explicit break or end of string */
615 for (;;)
616 {
617 for (++posLast; posLast < posCur - 1; ++posLast)
618 {
619 brks[posLast] = LINEBREAK_INSIDEACHAR;
620 }
621 assert(posLast == posCur - 1);
622 lbcLast = lbcNew;
623 ch = get_next_char(s, len, &posCur);
624 if (ch == EOS)
625 break;
626 lbcNew = get_char_lb_class_lang(ch, lbpLang);
627 if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
628 {
629 brks[posLast] = LINEBREAK_MUSTBREAK;
630 lbcCur = resolve_lb_class(lbcNew, lang);
631 goto nextline;
632 }
633
634 switch (lbcNew)
635 {
636 case LBP_SP:
637 brks[posLast] = LINEBREAK_NOBREAK;
638 continue;
639 case LBP_BK:
640 case LBP_LF:
641 case LBP_NL:
642 brks[posLast] = LINEBREAK_NOBREAK;
643 lbcCur = LBP_BK;
644 continue;
645 case LBP_CR:
646 brks[posLast] = LINEBREAK_NOBREAK;
647 lbcCur = LBP_CR;
648 continue;
649 case LBP_CB:
650 brks[posLast] = LINEBREAK_ALLOWBREAK;
651 lbcCur = LBP_BA;
652 continue;
653 default:
654 break;
655 }
656
657 lbcNew = resolve_lb_class(lbcNew, lang);
658
659 /* TODO: LB21a, as introduced by Revision 28 of UAX#14, is not
660 * yet implemented below. */
661
662 assert(lbcCur <= LBP_JT);
663 assert(lbcNew <= LBP_JT);
664 switch (baTable[lbcCur - 1][lbcNew - 1])
665 {
666 case DIR_BRK:
667 brks[posLast] = LINEBREAK_ALLOWBREAK;
668 break;
669 case CMI_BRK:
670 case IND_BRK:
671 if (lbcLast == LBP_SP)
672 {
673 brks[posLast] = LINEBREAK_ALLOWBREAK;
674 }
675 else
676 {
677 brks[posLast] = LINEBREAK_NOBREAK;
678 }
679 break;
680 case CMP_BRK:
681 brks[posLast] = LINEBREAK_NOBREAK;
682 if (lbcLast != LBP_SP)
683 continue;
684 break;
685 case PRH_BRK:
686 brks[posLast] = LINEBREAK_NOBREAK;
687 break;
688 }
689
690 lbcCur = lbcNew;
691 }
692
693 assert(posLast == posCur - 1 && posCur <= len);
694 /* Break after the last character */
695 brks[posLast] = LINEBREAK_MUSTBREAK;
696 /* When the input contains incomplete sequences */
697 while (posCur < len)
698 {
699 brks[posCur++] = LINEBREAK_INSIDEACHAR;
700 }
701}
702
703/**
704 * Sets the line breaking information for a UTF-8 input string.
705 *
706 * @param[in] s input UTF-8 string
707 * @param[in] len length of the input
708 * @param[in] lang language of the input
709 * @param[out] brks pointer to the output breaking data, containing
710 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
711 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
712 */
713void set_linebreaks_utf8(
714 const utf8_t *s,
715 size_t len,
716 const char *lang,
717 char *brks)
718{
719 set_linebreaks(s, len, lang, brks,
720 (get_next_char_t)lb_get_next_char_utf8);
721}
722
723/**
724 * Sets the line breaking information for a UTF-16 input string.
725 *
726 * @param[in] s input UTF-16 string
727 * @param[in] len length of the input
728 * @param[in] lang language of the input
729 * @param[out] brks pointer to the output breaking data, containing
730 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
731 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
732 */
733void set_linebreaks_utf16(
734 const utf16_t *s,
735 size_t len,
736 const char *lang,
737 char *brks)
738{
739 set_linebreaks(s, len, lang, brks,
740 (get_next_char_t)lb_get_next_char_utf16);
741}
742
743/**
744 * Sets the line breaking information for a UTF-32 input string.
745 *
746 * @param[in] s input UTF-32 string
747 * @param[in] len length of the input
748 * @param[in] lang language of the input
749 * @param[out] brks pointer to the output breaking data, containing
750 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
751 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
752 */
753void set_linebreaks_utf32(
754 const utf32_t *s,
755 size_t len,
756 const char *lang,
757 char *brks)
758{
759 set_linebreaks(s, len, lang, brks,
760 (get_next_char_t)lb_get_next_char_utf32);
761}
762
763/**
764 * Tells whether a line break can occur between two Unicode characters.
765 * This is a wrapper function to expose a simple interface. Generally
766 * speaking, it is better to use #set_linebreaks_utf32 instead, since
767 * complicated cases involving combining marks, spaces, etc. cannot be
768 * correctly processed.
769 *
770 * @param char1 the first Unicode character
771 * @param char2 the second Unicode character
772 * @param lang language of the input
773 * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
774 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
775 */
776int is_line_breakable(
777 utf32_t char1,
778 utf32_t char2,
779 const char* lang)
780{
781 utf32_t s[2];
782 char brks[2];
783 s[0] = char1;
784 s[1] = char2;
785 set_linebreaks_utf32(s, 2, lang, brks);
786 return brks[0];
787}