summaryrefslogtreecommitdiff
path: root/src/static_libs/libunibreak/linebreak.c
diff options
context:
space:
mode:
authorTom Hacohen <tom@stosb.com>2015-05-07 10:02:40 +0100
committerTom Hacohen <tom@stosb.com>2015-05-07 10:03:26 +0100
commita2a9f33802a3923c1469789f66d5fdab1eaea943 (patch)
tree8c114ee6b77a5877dff96a62f90b2f9b23d50f2c /src/static_libs/libunibreak/linebreak.c
parent92ff90ecca98f9e8e66a1f7a3ecf4e46f65913d4 (diff)
Static deps: Move unibreak to be an external dep.
We need any version of libunibreak. The first one was released in mid-2012. Even slow distros like Ubuntu already have an LTS release with a good enough version, so I consider this enough to justify removing the maintenance cost. This has been discussed on IRC. @feature
Diffstat (limited to 'src/static_libs/libunibreak/linebreak.c')
-rw-r--r--src/static_libs/libunibreak/linebreak.c879
1 files changed, 0 insertions, 879 deletions
diff --git a/src/static_libs/libunibreak/linebreak.c b/src/static_libs/libunibreak/linebreak.c
deleted file mode 100644
index 9716df4860..0000000000
--- a/src/static_libs/libunibreak/linebreak.c
+++ /dev/null
@@ -1,879 +0,0 @@
1/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2
3/*
4 * Line breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2008-2013 Wu Yongwei <wuyongwei at gmail dot com>
8 * Copyright (C) 2013 Petr Filipsky <philodej at gmail dot com>
9 *
10 * This software is provided 'as-is', without any express or implied
11 * warranty. In no event will the author be held liable for any damages
12 * arising from the use of this software.
13 *
14 * Permission is granted to anyone to use this software for any purpose,
15 * including commercial applications, and to alter it and redistribute
16 * it freely, subject to the following restrictions:
17 *
18 * 1. The origin of this software must not be misrepresented; you must
19 * not claim that you wrote the original software. If you use this
20 * software in a product, an acknowledgement in the product
21 * documentation would be appreciated but is not required.
22 * 2. Altered source versions must be plainly marked as such, and must
23 * not be misrepresented as being the original software.
24 * 3. This notice may not be removed or altered from any source
25 * distribution.
26 *
27 * The main reference is Unicode Standard Annex 14 (UAX #14):
28 * <URL:http://www.unicode.org/reports/tr14/>
29 *
30 * When this library was designed, this annex was at Revision 19, for
31 * Unicode 5.0.0:
32 * <URL:http://www.unicode.org/reports/tr14/tr14-19.html>
33 *
34 * This library has been updated according to Revision 30, for
35 * Unicode 6.2.0:
36 * <URL:http://www.unicode.org/reports/tr14/tr14-30.html>
37 *
38 * The Unicode Terms of Use are available at
39 * <URL:http://www.unicode.org/copyright.html>
40 */
41
42/**
43 * @file linebreak.c
44 *
45 * Implementation of the line breaking algorithm as described in Unicode
46 * Standard Annex 14.
47 *
48 * @version 2.5, 2013/11/14
49 * @author Wu Yongwei
50 * @author Petr Filipsky
51 */
52
53#include <assert.h>
54#include <stddef.h>
55#include <string.h>
56#include "linebreak.h"
57#include "linebreakdef.h"
58
59/**
60 * Special value used internally to indicate an undefined break result.
61 */
62#define LINEBREAK_UNDEFINED -1
63
64/**
65 * Size of the second-level index to the line breaking properties.
66 */
67#define LINEBREAK_INDEX_SIZE 40
68
69/**
70 * Version number of the library.
71 */
72const int linebreak_version = LINEBREAK_VERSION;
73
/**
 * Break actions.  These are the cell values of the break action pair
 * table (#baTable) that follows.
 */
enum BreakAction
{
    /** Direct break opportunity */
    DIR_BRK = 0,
    /** Indirect break opportunity */
    IND_BRK = 1,
    /** Indirect break opportunity for combining marks */
    CMI_BRK = 2,
    /** Prohibited break for combining marks */
    CMP_BRK = 3,
    /** Prohibited break */
    PRH_BRK = 4
};
86
87/**
88 * Break action pair table. This is a direct mapping of Table 2 of
89 * Unicode Standard Annex 14, Revision 30.
90 */
91static enum BreakAction baTable[LBP_RI][LBP_RI] = {
92 { /* OP */
93 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
94 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
95 PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
96 CMP_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK, PRH_BRK,
97 PRH_BRK },
98 { /* CL */
99 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
100 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
101 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
102 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
103 DIR_BRK },
104 { /* CP */
105 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, PRH_BRK, PRH_BRK,
106 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
107 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
108 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
109 DIR_BRK },
110 { /* QU */
111 PRH_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
112 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
113 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
114 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
115 IND_BRK },
116 { /* GL */
117 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
118 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
119 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
120 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
121 IND_BRK },
122 { /* NS */
123 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
124 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
125 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
126 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
127 DIR_BRK },
128 { /* EX */
129 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
130 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
131 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
132 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
133 DIR_BRK },
134 { /* SY */
135 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
136 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
137 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
138 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
139 DIR_BRK },
140 { /* IS */
141 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
142 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
143 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
144 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
145 DIR_BRK },
146 { /* PR */
147 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
148 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
149 IND_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
150 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
151 DIR_BRK },
152 { /* PO */
153 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
154 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
155 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
156 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
157 DIR_BRK },
158 { /* NU */
159 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
160 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
161 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
162 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
163 DIR_BRK },
164 { /* AL */
165 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
166 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
167 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
168 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
169 DIR_BRK },
170 { /* HL */
171 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
172 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
173 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
174 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
175 DIR_BRK },
176 { /* ID */
177 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
178 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
179 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
180 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
181 DIR_BRK },
182 { /* IN */
183 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
184 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
185 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
186 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
187 DIR_BRK },
188 { /* HY */
189 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
190 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK,
191 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
192 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
193 DIR_BRK },
194 { /* BA */
195 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, DIR_BRK, IND_BRK, PRH_BRK,
196 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
197 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
198 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
199 DIR_BRK },
200 { /* BB */
201 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
202 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
203 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
204 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
205 IND_BRK },
206 { /* B2 */
207 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
208 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
209 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, PRH_BRK, PRH_BRK,
210 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
211 DIR_BRK },
212 { /* ZW */
213 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
214 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
215 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
216 DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
217 DIR_BRK },
218 { /* CM */
219 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
220 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, IND_BRK,
221 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
222 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
223 DIR_BRK },
224 { /* WJ */
225 IND_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
226 PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
227 IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
228 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK,
229 IND_BRK },
230 { /* H2 */
231 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
232 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
233 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
234 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
235 DIR_BRK },
236 { /* H3 */
237 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
238 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
239 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
240 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
241 DIR_BRK },
242 { /* JL */
243 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
244 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
245 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
246 CMI_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK,
247 DIR_BRK },
248 { /* JV */
249 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
250 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
251 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
252 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK, IND_BRK,
253 DIR_BRK },
254 { /* JT */
255 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
256 PRH_BRK, PRH_BRK, DIR_BRK, IND_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
257 DIR_BRK, IND_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
258 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, IND_BRK,
259 DIR_BRK },
260 { /* RI */
261 DIR_BRK, PRH_BRK, PRH_BRK, IND_BRK, IND_BRK, IND_BRK, PRH_BRK,
262 PRH_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
263 DIR_BRK, DIR_BRK, IND_BRK, IND_BRK, DIR_BRK, DIR_BRK, PRH_BRK,
264 CMI_BRK, PRH_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK, DIR_BRK,
265 IND_BRK },
266};
267
268/**
269 * Struct for the second-level index to the line breaking properties.
270 */
271struct LineBreakPropertiesIndex
272{
273 utf32_t end; /**< End coding point */
274 struct LineBreakProperties *lbp;/**< Pointer to line breaking properties */
275};
276
277/**
278 * Second-level index to the line breaking properties.
279 */
280static struct LineBreakPropertiesIndex lb_prop_index[LINEBREAK_INDEX_SIZE] =
281{
282 { 0xFFFFFFFF, lb_prop_default }
283};
284
285/**
286 * Initializes the second-level index to the line breaking properties.
287 * If it is not called, the performance of #get_char_lb_class_lang (and
288 * thus the main functionality) can be pretty bad, especially for big
289 * code points like those of Chinese.
290 */
291void init_linebreak(void)
292{
293 size_t i;
294 size_t iPropDefault;
295 size_t len;
296 size_t step;
297
298 len = 0;
299 while (lb_prop_default[len].prop != LBP_Undefined)
300 ++len;
301 step = len / LINEBREAK_INDEX_SIZE;
302 iPropDefault = 0;
303 for (i = 0; i < LINEBREAK_INDEX_SIZE; ++i)
304 {
305 lb_prop_index[i].lbp = lb_prop_default + iPropDefault;
306 iPropDefault += step;
307 lb_prop_index[i].end = lb_prop_default[iPropDefault].start - 1;
308 }
309 lb_prop_index[--i].end = 0xFFFFFFFF;
310}
311
312/**
313 * Gets the language-specific line breaking properties.
314 *
315 * @param lang language of the text
316 * @return pointer to the language-specific line breaking
317 * properties array if found; \c NULL otherwise
318 */
319static struct LineBreakProperties *get_lb_prop_lang(const char *lang)
320{
321 struct LineBreakPropertiesLang *lbplIter;
322 if (lang != NULL)
323 {
324 for (lbplIter = lb_prop_lang_map; lbplIter->lang != NULL; ++lbplIter)
325 {
326 if (strncmp(lang, lbplIter->lang, lbplIter->namelen) == 0)
327 {
328 return lbplIter->lbp;
329 }
330 }
331 }
332 return NULL;
333}
334
335/**
336 * Gets the line breaking class of a character from a line breaking
337 * properties array.
338 *
339 * @param ch character to check
340 * @param lbp pointer to the line breaking properties array
341 * @return the line breaking class if found; \c LBP_XX otherwise
342 */
343static enum LineBreakClass get_char_lb_class(
344 utf32_t ch,
345 struct LineBreakProperties *lbp)
346{
347 while (lbp->prop != LBP_Undefined && ch >= lbp->start)
348 {
349 if (ch <= lbp->end)
350 return lbp->prop;
351 ++lbp;
352 }
353 return LBP_XX;
354}
355
356/**
357 * Gets the line breaking class of a character from the default line
358 * breaking properties array.
359 *
360 * @param ch character to check
361 * @return the line breaking class if found; \c LBP_XX otherwise
362 */
363static enum LineBreakClass get_char_lb_class_default(
364 utf32_t ch)
365{
366 size_t i = 0;
367 while (ch > lb_prop_index[i].end)
368 ++i;
369 assert(i < LINEBREAK_INDEX_SIZE);
370 return get_char_lb_class(ch, lb_prop_index[i].lbp);
371}
372
373/**
374 * Gets the line breaking class of a character for a specific
375 * language. This function will check the language-specific data first,
376 * and then the default data if there is no language-specific property
377 * available for the character.
378 *
379 * @param ch character to check
380 * @param lbpLang pointer to the language-specific line breaking
381 * properties array
382 * @return the line breaking class if found; \c LBP_XX
383 * otherwise
384 */
385static enum LineBreakClass get_char_lb_class_lang(
386 utf32_t ch,
387 struct LineBreakProperties *lbpLang)
388{
389 enum LineBreakClass lbcResult;
390
391 /* Find the language-specific line breaking class for a character */
392 if (lbpLang)
393 {
394 lbcResult = get_char_lb_class(ch, lbpLang);
395 if (lbcResult != LBP_XX)
396 return lbcResult;
397 }
398
399 /* Find the generic language-specific line breaking class, if no
400 * language context is provided, or language-specific data are not
401 * available for the specific character in the specified language */
402 return get_char_lb_class_default(ch);
403}
404
405/**
406 * Resolves the line breaking class for certain ambiguous or complicated
407 * characters. They are treated in a simplistic way in this
408 * implementation.
409 *
410 * @param lbc line breaking class to resolve
411 * @param lang language of the text
412 * @return the resolved line breaking class
413 */
414static enum LineBreakClass resolve_lb_class(
415 enum LineBreakClass lbc,
416 const char *lang)
417{
418 switch (lbc)
419 {
420 case LBP_AI:
421 if (lang != NULL &&
422 (strncmp(lang, "zh", 2) == 0 || /* Chinese */
423 strncmp(lang, "ja", 2) == 0 || /* Japanese */
424 strncmp(lang, "ko", 2) == 0)) /* Korean */
425 {
426 return LBP_ID;
427 }
428 else
429 {
430 return LBP_AL;
431 }
432 case LBP_CJ:
433 /* Simplified for `normal' line breaking. See
434 * <url:http://www.unicode.org/reports/tr14/tr14-30.html#CJ>
435 * for details. */
436 return LBP_ID;
437 case LBP_SA:
438 case LBP_SG:
439 case LBP_XX:
440 return LBP_AL;
441 default:
442 return lbc;
443 }
444}
445
446/**
447 * Treats specially for the first character in a line.
448 *
449 * @param[in,out] lbpCtx pointer to the line breaking context
450 * @pre \a lbpCtx->lbcCur has a valid line break class
451 * @post \a lbpCtx->lbcCur has the updated line break class
452 */
453static void treat_first_char(
454 struct LineBreakContext* lbpCtx)
455{
456 switch (lbpCtx->lbcCur)
457 {
458 case LBP_LF:
459 case LBP_NL:
460 lbpCtx->lbcCur = LBP_BK; /* Rule LB5 */
461 break;
462 case LBP_CB:
463 lbpCtx->lbcCur = LBP_BA; /* Rule LB20 */
464 break;
465 case LBP_SP:
466 lbpCtx->lbcCur = LBP_WJ; /* Leading space treated as WJ */
467 break;
468 default:
469 break;
470 }
471}
472
473/**
474 * Tries telling the line break opportunity by simple rules.
475 *
476 * @param[in,out] lbpCtx pointer to the line breaking context
477 * @pre \a lbpCtx->lbcCur has the current line break
478 * class; and \a lbpCtx->lbcNew has the line
479 * break class for the next character
480 * @post \a lbpCtx->lbcCur has the updated line break
481 * class
482 * @return break result, one of #LINEBREAK_MUSTBREAK,
483 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
484 * if identified; or #LINEBREAK_UNDEFINED if
485 * table lookup is needed
486 */
487static int get_lb_result_simple(
488 struct LineBreakContext* lbpCtx)
489{
490 if (lbpCtx->lbcCur == LBP_BK
491 || (lbpCtx->lbcCur == LBP_CR && lbpCtx->lbcNew != LBP_LF))
492 {
493 return LINEBREAK_MUSTBREAK; /* Rules LB4 and LB5 */
494 }
495
496 switch (lbpCtx->lbcNew)
497 {
498 case LBP_SP:
499 return LINEBREAK_NOBREAK; /* Rule LB7; no change to lbcCur */
500 case LBP_BK:
501 case LBP_LF:
502 case LBP_NL:
503 lbpCtx->lbcCur = LBP_BK; /* Mandatory break after */
504 return LINEBREAK_NOBREAK; /* Rule LB6 */
505 case LBP_CR:
506 lbpCtx->lbcCur = LBP_CR;
507 return LINEBREAK_NOBREAK; /* Rule LB6 */
508 case LBP_CB:
509 lbpCtx->lbcCur = LBP_BA;
510 return LINEBREAK_ALLOWBREAK; /* Rule LB20 */
511 default:
512 return LINEBREAK_UNDEFINED; /* Table lookup is needed */
513 }
514}
515
516/**
517 * Tells the line break opportunity by table lookup.
518 *
519 * @param[in,out] lbpCtx pointer to the line breaking context
520 * @pre \a lbpCtx->lbcCur has the current line break
521 * class; \a lbpCtx->lbcLast has the line break
522 * class for the last character; and \a
523 * lbcCur->lbcNew has the line break class for
524 * the next character
525 * @post \a lbpCtx->lbcCur has the updated line break
526 * class
527 * @return break result, one of #LINEBREAK_MUSTBREAK,
528 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
529 */
530static int get_lb_result_lookup(
531 struct LineBreakContext* lbpCtx)
532{
533 /* TODO: Rule LB21a, as introduced by Revision 28 of UAX#14, is not
534 * yet implemented below. */
535 int brk = LINEBREAK_UNDEFINED;
536 assert(lbpCtx->lbcCur <= LBP_JT);
537 assert(lbpCtx->lbcNew <= LBP_JT);
538 switch (baTable[lbpCtx->lbcCur - 1][lbpCtx->lbcNew - 1])
539 {
540 case DIR_BRK:
541 brk = LINEBREAK_ALLOWBREAK;
542 break;
543 case CMI_BRK:
544 case IND_BRK:
545 brk = (lbpCtx->lbcLast == LBP_SP)
546 ? LINEBREAK_ALLOWBREAK
547 : LINEBREAK_NOBREAK;
548 break;
549 case CMP_BRK:
550 brk = LINEBREAK_NOBREAK;
551 if (lbpCtx->lbcLast != LBP_SP)
552 return brk; /* Do not update lbcCur */
553 break;
554 case PRH_BRK:
555 brk = LINEBREAK_NOBREAK;
556 break;
557 }
558 lbpCtx->lbcCur = lbpCtx->lbcNew;
559 return brk;
560}
561
562/**
563 * Initializes line breaking context for a given language.
564 *
565 * @param[in,out] lbpCtx pointer to the line breaking context
566 * @param[in] ch the first character to process
567 * @param[in] lang language of the input
568 * @post the line breaking context is initialized
569 */
570void lb_init_break_context(
571 struct LineBreakContext* lbpCtx,
572 utf32_t ch,
573 const char* lang)
574{
575 lbpCtx->lang = lang;
576 lbpCtx->lbpLang = get_lb_prop_lang(lang);
577 lbpCtx->lbcLast = LBP_Undefined;
578 lbpCtx->lbcNew = LBP_Undefined;
579 lbpCtx->lbcCur = resolve_lb_class(
580 get_char_lb_class_lang(ch, lbpCtx->lbpLang),
581 lbpCtx->lang);
582 treat_first_char(lbpCtx);
583}
584
585/**
586 * Updates LineBreakingContext for the next code point and returns
587 * the detected break.
588 *
589 * @param[in,out] lbpCtx pointer to the line breaking context
590 * @param[in] ch Unicode code point
591 * @return break result, one of #LINEBREAK_MUSTBREAK,
592 * #LINEBREAK_ALLOWBREAK, and #LINEBREAK_NOBREAK
593 * @post the line breaking context is updated
594 */
595int lb_process_next_char(
596 struct LineBreakContext* lbpCtx,
597 utf32_t ch )
598{
599 int brk;
600
601 lbpCtx->lbcLast = lbpCtx->lbcNew;
602 lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
603 brk = get_lb_result_simple(lbpCtx);
604 switch (brk)
605 {
606 case LINEBREAK_MUSTBREAK:
607 lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
608 treat_first_char(lbpCtx);
609 break;
610 case LINEBREAK_UNDEFINED:
611 lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
612 brk = get_lb_result_lookup(lbpCtx);
613 break;
614 default:
615 break;
616 }
617 return brk;
618}
619
620/**
621 * Gets the next Unicode character in a UTF-8 sequence. The index will
622 * be advanced to the next complete character, unless the end of string
623 * is reached in the middle of a UTF-8 sequence.
624 *
625 * @param[in] s input UTF-8 string
626 * @param[in] len length of the string in bytes
627 * @param[in,out] ip pointer to the index
628 * @return the Unicode character beginning at the index; or
629 * #EOS if end of input is encountered
630 */
631utf32_t lb_get_next_char_utf8(
632 const utf8_t *s,
633 size_t len,
634 size_t *ip)
635{
636 utf8_t ch;
637 utf32_t res;
638
639 assert(*ip <= len);
640 if (*ip == len)
641 return EOS;
642 ch = s[*ip];
643
644 if (ch < 0xC2 || ch > 0xF4)
645 { /* One-byte sequence, tail (should not occur), or invalid */
646 *ip += 1;
647 return ch;
648 }
649 else if (ch < 0xE0)
650 { /* Two-byte sequence */
651 if (*ip + 2 > len)
652 return EOS;
653 res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
654 *ip += 2;
655 return res;
656 }
657 else if (ch < 0xF0)
658 { /* Three-byte sequence */
659 if (*ip + 3 > len)
660 return EOS;
661 res = ((ch & 0x0F) << 12) +
662 ((s[*ip + 1] & 0x3F) << 6) +
663 ((s[*ip + 2] & 0x3F));
664 *ip += 3;
665 return res;
666 }
667 else
668 { /* Four-byte sequence */
669 if (*ip + 4 > len)
670 return EOS;
671 res = ((ch & 0x07) << 18) +
672 ((s[*ip + 1] & 0x3F) << 12) +
673 ((s[*ip + 2] & 0x3F) << 6) +
674 ((s[*ip + 3] & 0x3F));
675 *ip += 4;
676 return res;
677 }
678}
679
680/**
681 * Gets the next Unicode character in a UTF-16 sequence. The index will
682 * be advanced to the next complete character, unless the end of string
683 * is reached in the middle of a UTF-16 surrogate pair.
684 *
685 * @param[in] s input UTF-16 string
686 * @param[in] len length of the string in words
687 * @param[in,out] ip pointer to the index
688 * @return the Unicode character beginning at the index; or
689 * #EOS if end of input is encountered
690 */
691utf32_t lb_get_next_char_utf16(
692 const utf16_t *s,
693 size_t len,
694 size_t *ip)
695{
696 utf16_t ch;
697
698 assert(*ip <= len);
699 if (*ip == len)
700 return EOS;
701 ch = s[(*ip)++];
702
703 if (ch < 0xD800 || ch > 0xDBFF)
704 { /* If the character is not a high surrogate */
705 return ch;
706 }
707 if (*ip == len)
708 { /* If the input ends here (an error) */
709 --(*ip);
710 return EOS;
711 }
712 if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
713 { /* If the next character is not the low surrogate (an error) */
714 return ch;
715 }
716 /* Return the constructed character and advance the index again */
717 return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
718}
719
720/**
721 * Gets the next Unicode character in a UTF-32 sequence. The index will
722 * be advanced to the next character.
723 *
724 * @param[in] s input UTF-32 string
725 * @param[in] len length of the string in dwords
726 * @param[in,out] ip pointer to the index
727 * @return the Unicode character beginning at the index; or
728 * #EOS if end of input is encountered
729 */
730utf32_t lb_get_next_char_utf32(
731 const utf32_t *s,
732 size_t len,
733 size_t *ip)
734{
735 assert(*ip <= len);
736 if (*ip == len)
737 return EOS;
738 return s[(*ip)++];
739}
740
741/**
742 * Sets the line breaking information for a generic input string.
743 *
744 * @param[in] s input string
745 * @param[in] len length of the input
746 * @param[in] lang language of the input
747 * @param[out] brks pointer to the output breaking data,
748 * containing #LINEBREAK_MUSTBREAK,
749 * #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
750 * or #LINEBREAK_INSIDEACHAR
751 * @param[in] get_next_char function to get the next UTF-32 character
752 */
753void set_linebreaks(
754 const void *s,
755 size_t len,
756 const char *lang,
757 char *brks,
758 get_next_char_t get_next_char)
759{
760 utf32_t ch;
761 struct LineBreakContext lbCtx;
762 size_t posCur = 0;
763 size_t posLast = 0;
764
765 --posLast; /* To be ++'d later */
766 ch = get_next_char(s, len, &posCur);
767 if (ch == EOS)
768 return;
769 lb_init_break_context(&lbCtx, ch, lang);
770
771 /* Process a line till an explicit break or end of string */
772 for (;;)
773 {
774 for (++posLast; posLast < posCur - 1; ++posLast)
775 {
776 brks[posLast] = LINEBREAK_INSIDEACHAR;
777 }
778 assert(posLast == posCur - 1);
779 ch = get_next_char(s, len, &posCur);
780 if (ch == EOS)
781 break;
782 brks[posLast] = lb_process_next_char(&lbCtx, ch);
783 }
784
785 assert(posLast == posCur - 1 && posCur <= len);
786 /* Break after the last character */
787 brks[posLast] = LINEBREAK_MUSTBREAK;
788 /* When the input contains incomplete sequences */
789 while (posCur < len)
790 {
791 brks[posCur++] = LINEBREAK_INSIDEACHAR;
792 }
793}
794
795/**
796 * Sets the line breaking information for a UTF-8 input string.
797 *
798 * @param[in] s input UTF-8 string
799 * @param[in] len length of the input
800 * @param[in] lang language of the input
801 * @param[out] brks pointer to the output breaking data, containing
802 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
803 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
804 */
805void set_linebreaks_utf8(
806 const utf8_t *s,
807 size_t len,
808 const char *lang,
809 char *brks)
810{
811 set_linebreaks(s, len, lang, brks,
812 (get_next_char_t)lb_get_next_char_utf8);
813}
814
815/**
816 * Sets the line breaking information for a UTF-16 input string.
817 *
818 * @param[in] s input UTF-16 string
819 * @param[in] len length of the input
820 * @param[in] lang language of the input
821 * @param[out] brks pointer to the output breaking data, containing
822 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
823 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
824 */
825void set_linebreaks_utf16(
826 const utf16_t *s,
827 size_t len,
828 const char *lang,
829 char *brks)
830{
831 set_linebreaks(s, len, lang, brks,
832 (get_next_char_t)lb_get_next_char_utf16);
833}
834
835/**
836 * Sets the line breaking information for a UTF-32 input string.
837 *
838 * @param[in] s input UTF-32 string
839 * @param[in] len length of the input
840 * @param[in] lang language of the input
841 * @param[out] brks pointer to the output breaking data, containing
842 * #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
843 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
844 */
845void set_linebreaks_utf32(
846 const utf32_t *s,
847 size_t len,
848 const char *lang,
849 char *brks)
850{
851 set_linebreaks(s, len, lang, brks,
852 (get_next_char_t)lb_get_next_char_utf32);
853}
854
855/**
856 * Tells whether a line break can occur between two Unicode characters.
857 * This is a wrapper function to expose a simple interface. Generally
858 * speaking, it is better to use #set_linebreaks_utf32 instead, since
859 * complicated cases involving combining marks, spaces, etc. cannot be
860 * correctly processed.
861 *
862 * @param char1 the first Unicode character
863 * @param char2 the second Unicode character
864 * @param lang language of the input
865 * @return one of #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
866 * #LINEBREAK_NOBREAK, or #LINEBREAK_INSIDEACHAR
867 */
868int is_line_breakable(
869 utf32_t char1,
870 utf32_t char2,
871 const char* lang)
872{
873 utf32_t s[2];
874 char brks[2];
875 s[0] = char1;
876 s[1] = char2;
877 set_linebreaks_utf32(s, 2, lang, brks);
878 return brks[0];
879}