summary refs log tree commit diff
path: root/src/static_libs/libunibreak/wordbreak.c
diff options
context:
space:
mode:
author Tom Hacohen <tom@stosb.com> 2014-01-21 16:41:06 +0000
committer Tom Hacohen <tom@stosb.com> 2014-01-21 16:41:06 +0000
commit cff1a9a59f40b1e83ed1db8145108cae53504d4f (patch)
tree 524c58e21db031ab6acd6382dfdacb9c66d91a65 /src/static_libs/libunibreak/wordbreak.c
parent cc8fa1da451d588e4218a2b8f8d3eebb9b38890f (diff)
Synced libunibreak local copy with upstream.
This fixes T805.
Diffstat (limited to 'src/static_libs/libunibreak/wordbreak.c')
-rw-r--r-- src/static_libs/libunibreak/wordbreak.c 674
1 files changed, 339 insertions, 335 deletions
diff --git a/src/static_libs/libunibreak/wordbreak.c b/src/static_libs/libunibreak/wordbreak.c
index f2996c0e81..e67a1f8507 100644
--- a/src/static_libs/libunibreak/wordbreak.c
+++ b/src/static_libs/libunibreak/wordbreak.c
@@ -1,10 +1,10 @@
1/* vim: set tabstop=4 shiftwidth=4: */ 1/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2 2
3/* 3/*
4 * Word breaking in a Unicode sequence. Designed to be used in a 4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer. 5 * generic text renderer.
6 * 6 *
7 * Copyright (C) 2012 Tom Hacohen <tom@stosb.com> 7 * Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
8 * 8 *
9 * This software is provided 'as-is', without any express or implied 9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages 10 * warranty. In no event will the author be held liable for any damages
@@ -24,24 +24,28 @@
24 * distribution. 24 * distribution.
25 * 25 *
26 * The main reference is Unicode Standard Annex 29 (UAX #29): 26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29> 27 * <URL:http://unicode.org/reports/tr29>
28 * 28 *
29 * When this library was designed, this annex was at Revision 17, for 29 * When this library was designed, this annex was at Revision 17, for
30 * Unicode 6.0.0: 30 * Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html> 31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32 *
33 * This library has been updated according to Revision 21, for
34 * Unicode 6.2.0:
35 * <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
32 * 36 *
33 * The Unicode Terms of Use are available at 37 * The Unicode Terms of Use are available at
34 * <URL:http://www.unicode.org/copyright.html> 38 * <URL:http://www.unicode.org/copyright.html>
35 */ 39 */
36 40
37/** 41/**
38 * @file wordbreak.c 42 * @file wordbreak.c
39 * 43 *
40 * Implementation of the word breaking algorithm as described in Unicode 44 * Implementation of the word breaking algorithm as described in Unicode
41 * Standard Annex 29. 45 * Standard Annex 29.
42 * 46 *
43 * @version 2.3, 2013/05/14 47 * @version 2.4, 2013/09/28
44 * @author Tom Hacohen 48 * @author Tom Hacohen
45 */ 49 */
46 50
47#include <assert.h> 51#include <assert.h>
@@ -66,34 +70,34 @@ void init_wordbreak(void)
66/** 70/**
67 * Gets the word breaking class of a character. 71 * Gets the word breaking class of a character.
68 * 72 *
69 * @param ch character to check 73 * @param ch character to check
70 * @param wbp pointer to the wbp breaking properties array 74 * @param wbp pointer to the wbp breaking properties array
71 * @param len size of the wbp array in number of items 75 * @param len size of the wbp array in number of items
72 * @return the word breaking class if found; \c WBP_Any otherwise 76 * @return the word breaking class if found; \c WBP_Any otherwise
73 */ 77 */
74static enum WordBreakClass get_char_wb_class( 78static enum WordBreakClass get_char_wb_class(
75 utf32_t ch, 79 utf32_t ch,
76 struct WordBreakProperties *wbp, 80 struct WordBreakProperties *wbp,
77 size_t len) 81 size_t len)
78{ 82{
79 int min = 0; 83 int min = 0;
80 int max = len - 1; 84 int max = len - 1;
81 int mid; 85 int mid;
82 86
83 do 87 do
84 { 88 {
85 mid = (min + max) / 2; 89 mid = (min + max) / 2;
86 90
87 if (ch < wbp[mid].start) 91 if (ch < wbp[mid].start)
88 max = mid - 1; 92 max = mid - 1;
89 else if (ch > wbp[mid].end) 93 else if (ch > wbp[mid].end)
90 min = mid + 1; 94 min = mid + 1;
91 else 95 else
92 return wbp[mid].prop; 96 return wbp[mid].prop;
93 } 97 }
94 while (min <= max); 98 while (min <= max);
95 99
96 return WBP_Any; 100 return WBP_Any;
97} 101}
98 102
99/** 103/**
@@ -103,346 +107,346 @@ static enum WordBreakClass get_char_wb_class(
103 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are 107 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
104 * cells that we really don't want to break after. 108 * cells that we really don't want to break after.
105 * 109 *
106 * @param[in] s input string 110 * @param[in] s input string
107 * @param[out] brks breaks array to fill 111 * @param[out] brks breaks array to fill
108 * @param[in] posStart start position 112 * @param[in] posStart start position
109 * @param[in] posEnd end position (exclusive) 113 * @param[in] posEnd end position (exclusive)
110 * @param[in] len length of the string 114 * @param[in] len length of the string
111 * @param[in] brkType breaks type to use 115 * @param[in] brkType breaks type to use
112 * @param[in] get_next_char function to get the next UTF-32 character 116 * @param[in] get_next_char function to get the next UTF-32 character
113 */ 117 */
114static void set_brks_to( 118static void set_brks_to(
115 const void *s, 119 const void *s,
116 char *brks, 120 char *brks,
117 size_t posStart, 121 size_t posStart,
118 size_t posEnd, 122 size_t posEnd,
119 size_t len, 123 size_t len,
120 char brkType, 124 char brkType,
121 get_next_char_t get_next_char) 125 get_next_char_t get_next_char)
122{ 126{
123 size_t posNext = posStart; 127 size_t posNext = posStart;
124 while (posNext < posEnd) 128 while (posNext < posEnd)
125 { 129 {
126 utf32_t ch; 130 utf32_t ch;
127 ch = get_next_char(s, len, &posNext); 131 ch = get_next_char(s, len, &posNext);
128 assert(ch != EOS); 132 assert(ch != EOS);
129 for (; posStart < posNext - 1; ++posStart) 133 for (; posStart < posNext - 1; ++posStart)
130 brks[posStart] = WORDBREAK_INSIDEACHAR; 134 brks[posStart] = WORDBREAK_INSIDEACHAR;
131 assert(posStart == posNext - 1); 135 assert(posStart == posNext - 1);
132 136
133 /* Only set it if we haven't set it not to break before. */ 137 /* Only set it if we haven't set it not to break before. */
134 if (brks[posStart] != WORDBREAK_NOBREAK) 138 if (brks[posStart] != WORDBREAK_NOBREAK)
135 brks[posStart] = brkType; 139 brks[posStart] = brkType;
136 posStart = posNext; 140 posStart = posNext;
137 } 141 }
138} 142}
139 143
140/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */ 144/* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
141#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \ 145#define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
142 (cls == WBP_LF)) 146 (cls == WBP_LF))
143 147
144/** 148/**
145 * Sets the word breaking information for a generic input string. 149 * Sets the word breaking information for a generic input string.
146 * 150 *
147 * @param[in] s input string 151 * @param[in] s input string
148 * @param[in] len length of the input 152 * @param[in] len length of the input
149 * @param[in] lang language of the input 153 * @param[in] lang language of the input
150 * @param[out] brks pointer to the output breaking data, containing 154 * @param[out] brks pointer to the output breaking data, containing
151 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or 155 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
152 * #WORDBREAK_INSIDEACHAR 156 * #WORDBREAK_INSIDEACHAR
153 * @param[in] get_next_char function to get the next UTF-32 character 157 * @param[in] get_next_char function to get the next UTF-32 character
154 */ 158 */
155static void set_wordbreaks( 159static void set_wordbreaks(
156 const void *s, 160 const void *s,
157 size_t len, 161 size_t len,
158 const char *lang, 162 const char *lang,
159 char *brks, 163 char *brks,
160 get_next_char_t get_next_char) 164 get_next_char_t get_next_char)
161{ 165{
162 enum WordBreakClass wbcLast = WBP_Undefined; 166 enum WordBreakClass wbcLast = WBP_Undefined;
163 /* wbcSeqStart is the class that started the current sequence. 167 /* wbcSeqStart is the class that started the current sequence.
164 * WBP_Undefined is a special case that means "sot". 168 * WBP_Undefined is a special case that means "sot".
165 * This value is the class that is at the start of the current rule 169 * This value is the class that is at the start of the current rule
166 * matching sequence. For example, in case of Numeric+MidNum+Numeric 170 * matching sequence. For example, in case of Numeric+MidNum+Numeric
167 * it'll be Numeric all the way. 171 * it'll be Numeric all the way.
168 */ 172 */
169 enum WordBreakClass wbcSeqStart = WBP_Undefined; 173 enum WordBreakClass wbcSeqStart = WBP_Undefined;
170 utf32_t ch; 174 utf32_t ch;
171 size_t posNext = 0; 175 size_t posNext = 0;
172 size_t posCur = 0; 176 size_t posCur = 0;
173 size_t posLast = 0; 177 size_t posLast = 0;
174 178
175 /* TODO: Language-specific specialization. */ 179 /* TODO: Language-specific specialization. */
176 (void) lang; 180 (void) lang;
177 181
178 /* Init brks. */ 182 /* Init brks. */
179 memset(brks, WORDBREAK_BREAK, len); 183 memset(brks, WORDBREAK_BREAK, len);
180 184
181 ch = get_next_char(s, len, &posNext); 185 ch = get_next_char(s, len, &posNext);
182 186
183 while (ch != EOS) 187 while (ch != EOS)
184 { 188 {
185 enum WordBreakClass wbcCur; 189 enum WordBreakClass wbcCur;
186 wbcCur = get_char_wb_class(ch, wb_prop_default, 190 wbcCur = get_char_wb_class(ch, wb_prop_default,
187 ARRAY_LEN(wb_prop_default)); 191 ARRAY_LEN(wb_prop_default));
188 192
189 switch (wbcCur) 193 switch (wbcCur)
190 { 194 {
191 case WBP_CR: 195 case WBP_CR:
192 /* WB3b */ 196 /* WB3b */
193 set_brks_to(s, brks, posLast, posCur, len, 197 set_brks_to(s, brks, posLast, posCur, len,
194 WORDBREAK_BREAK, get_next_char); 198 WORDBREAK_BREAK, get_next_char);
195 wbcSeqStart = wbcCur; 199 wbcSeqStart = wbcCur;
196 posLast = posCur; 200 posLast = posCur;
197 break; 201 break;
198 202
199 case WBP_LF: 203 case WBP_LF:
200 if (wbcSeqStart == WBP_CR) /* WB3 */ 204 if (wbcSeqStart == WBP_CR) /* WB3 */
201 { 205 {
202 set_brks_to(s, brks, posLast, posCur, len, 206 set_brks_to(s, brks, posLast, posCur, len,
203 WORDBREAK_NOBREAK, get_next_char); 207 WORDBREAK_NOBREAK, get_next_char);
204 wbcSeqStart = wbcCur; 208 wbcSeqStart = wbcCur;
205 posLast = posCur; 209 posLast = posCur;
206 break; 210 break;
207 } 211 }
208 /* Fall off */ 212 /* Fall off */
209 213
210 case WBP_Newline: 214 case WBP_Newline:
211 /* WB3a,3b */ 215 /* WB3a,3b */
212 set_brks_to(s, brks, posLast, posCur, len, 216 set_brks_to(s, brks, posLast, posCur, len,
213 WORDBREAK_BREAK, get_next_char); 217 WORDBREAK_BREAK, get_next_char);
214 wbcSeqStart = wbcCur; 218 wbcSeqStart = wbcCur;
215 posLast = posCur; 219 posLast = posCur;
216 break; 220 break;
217 221
218 case WBP_Extend: 222 case WBP_Extend:
219 case WBP_Format: 223 case WBP_Format:
220 /* WB4 - If not the first char/after a newline (WB3a,3b), skip 224 /* WB4 - If not the first char/after a newline (WB3a,3b), skip
221 * this class, set it to be the same as the prev, and mark 225 * this class, set it to be the same as the prev, and mark
222 * brks not to break before them. */ 226 * brks not to break before them. */
223 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart)) 227 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
224 { 228 {
225 set_brks_to(s, brks, posLast, posCur, len, 229 set_brks_to(s, brks, posLast, posCur, len,
226 WORDBREAK_BREAK, get_next_char); 230 WORDBREAK_BREAK, get_next_char);
227 wbcSeqStart = wbcCur; 231 wbcSeqStart = wbcCur;
228 } 232 }
229 else 233 else
230 { 234 {
231 /* It's surely not the first */ 235 /* It's surely not the first */
232 brks[posCur - 1] = WORDBREAK_NOBREAK; 236 brks[posCur - 1] = WORDBREAK_NOBREAK;
233 /* "inherit" the previous class. */ 237 /* "inherit" the previous class. */
234 wbcCur = wbcLast; 238 wbcCur = wbcLast;
235 } 239 }
236 break; 240 break;
237 241
238 case WBP_Katakana: 242 case WBP_Katakana:
239 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */ 243 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
240 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ 244 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
241 { 245 {
242 set_brks_to(s, brks, posLast, posCur, len, 246 set_brks_to(s, brks, posLast, posCur, len,
243 WORDBREAK_NOBREAK, get_next_char); 247 WORDBREAK_NOBREAK, get_next_char);
244 } 248 }
245 /* No rule found, reset */ 249 /* No rule found, reset */
246 else 250 else
247 { 251 {
248 set_brks_to(s, brks, posLast, posCur, len, 252 set_brks_to(s, brks, posLast, posCur, len,
249 WORDBREAK_BREAK, get_next_char); 253 WORDBREAK_BREAK, get_next_char);
250 } 254 }
251 wbcSeqStart = wbcCur; 255 wbcSeqStart = wbcCur;
252 posLast = posCur; 256 posLast = posCur;
253 break; 257 break;
254 258
255 case WBP_ALetter: 259 case WBP_ALetter:
256 if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */ 260 if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
257 (wbcLast == WBP_Numeric) || /* WB10 */ 261 (wbcLast == WBP_Numeric) || /* WB10 */
258 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ 262 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
259 { 263 {
260 set_brks_to(s, brks, posLast, posCur, len, 264 set_brks_to(s, brks, posLast, posCur, len,
261 WORDBREAK_NOBREAK, get_next_char); 265 WORDBREAK_NOBREAK, get_next_char);
262 } 266 }
263 /* No rule found, reset */ 267 /* No rule found, reset */
264 else 268 else
265 { 269 {
266 set_brks_to(s, brks, posLast, posCur, len, 270 set_brks_to(s, brks, posLast, posCur, len,
267 WORDBREAK_BREAK, get_next_char); 271 WORDBREAK_BREAK, get_next_char);
268 } 272 }
269 wbcSeqStart = wbcCur; 273 wbcSeqStart = wbcCur;
270 posLast = posCur; 274 posLast = posCur;
271 break; 275 break;
272 276
273 case WBP_MidNumLet: 277 case WBP_MidNumLet:
274 if ((wbcLast == WBP_ALetter) || /* WB6,7 */ 278 if ((wbcLast == WBP_ALetter) || /* WB6,7 */
275 (wbcLast == WBP_Numeric)) /* WB11,12 */ 279 (wbcLast == WBP_Numeric)) /* WB11,12 */
276 { 280 {
277 /* Go on */ 281 /* Go on */
278 } 282 }
279 else 283 else
280 { 284 {
281 set_brks_to(s, brks, posLast, posCur, len, 285 set_brks_to(s, brks, posLast, posCur, len,
282 WORDBREAK_BREAK, get_next_char); 286 WORDBREAK_BREAK, get_next_char);
283 wbcSeqStart = wbcCur; 287 wbcSeqStart = wbcCur;
284 posLast = posCur; 288 posLast = posCur;
285 } 289 }
286 break; 290 break;
287 291
288 case WBP_MidLetter: 292 case WBP_MidLetter:
289 if (wbcLast == WBP_ALetter) /* WB6,7 */ 293 if (wbcLast == WBP_ALetter) /* WB6,7 */
290 { 294 {
291 /* Go on */ 295 /* Go on */
292 } 296 }
293 else 297 else
294 { 298 {
295 set_brks_to(s, brks, posLast, posCur, len, 299 set_brks_to(s, brks, posLast, posCur, len,
296 WORDBREAK_BREAK, get_next_char); 300 WORDBREAK_BREAK, get_next_char);
297 wbcSeqStart = wbcCur; 301 wbcSeqStart = wbcCur;
298 posLast = posCur; 302 posLast = posCur;
299 } 303 }
300 break; 304 break;
301 305
302 case WBP_MidNum: 306 case WBP_MidNum:
303 if (wbcLast == WBP_Numeric) /* WB11,12 */ 307 if (wbcLast == WBP_Numeric) /* WB11,12 */
304 { 308 {
305 /* Go on */ 309 /* Go on */
306 } 310 }
307 else 311 else
308 { 312 {
309 set_brks_to(s, brks, posLast, posCur, len, 313 set_brks_to(s, brks, posLast, posCur, len,
310 WORDBREAK_BREAK, get_next_char); 314 WORDBREAK_BREAK, get_next_char);
311 wbcSeqStart = wbcCur; 315 wbcSeqStart = wbcCur;
312 posLast = posCur; 316 posLast = posCur;
313 } 317 }
314 break; 318 break;
315 319
316 case WBP_Numeric: 320 case WBP_Numeric:
317 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */ 321 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
318 (wbcLast == WBP_ALetter) || /* WB9 */ 322 (wbcLast == WBP_ALetter) || /* WB9 */
319 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */ 323 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
320 { 324 {
321 set_brks_to(s, brks, posLast, posCur, len, 325 set_brks_to(s, brks, posLast, posCur, len,
322 WORDBREAK_NOBREAK, get_next_char); 326 WORDBREAK_NOBREAK, get_next_char);
323 } 327 }
324 /* No rule found, reset */ 328 /* No rule found, reset */
325 else 329 else
326 { 330 {
327 set_brks_to(s, brks, posLast, posCur, len, 331 set_brks_to(s, brks, posLast, posCur, len,
328 WORDBREAK_BREAK, get_next_char); 332 WORDBREAK_BREAK, get_next_char);
329 } 333 }
330 wbcSeqStart = wbcCur; 334 wbcSeqStart = wbcCur;
331 posLast = posCur; 335 posLast = posCur;
332 break; 336 break;
333 337
334 case WBP_ExtendNumLet: 338 case WBP_ExtendNumLet:
335 /* WB13a,13b */ 339 /* WB13a,13b */
336 if ((wbcSeqStart == wbcLast) && 340 if ((wbcSeqStart == wbcLast) &&
337 ((wbcLast == WBP_ALetter) || 341 ((wbcLast == WBP_ALetter) ||
338 (wbcLast == WBP_Numeric) || 342 (wbcLast == WBP_Numeric) ||
339 (wbcLast == WBP_Katakana) || 343 (wbcLast == WBP_Katakana) ||
340 (wbcLast == WBP_ExtendNumLet))) 344 (wbcLast == WBP_ExtendNumLet)))
341 { 345 {
342 set_brks_to(s, brks, posLast, posCur, len, 346 set_brks_to(s, brks, posLast, posCur, len,
343 WORDBREAK_NOBREAK, get_next_char); 347 WORDBREAK_NOBREAK, get_next_char);
344 } 348 }
345 /* No rule found, reset */ 349 /* No rule found, reset */
346 else 350 else
347 { 351 {
348 set_brks_to(s, brks, posLast, posCur, len, 352 set_brks_to(s, brks, posLast, posCur, len,
349 WORDBREAK_BREAK, get_next_char); 353 WORDBREAK_BREAK, get_next_char);
350 } 354 }
351 wbcSeqStart = wbcCur; 355 wbcSeqStart = wbcCur;
352 posLast = posCur; 356 posLast = posCur;
353 break; 357 break;
354 358
355 case WBP_Regional: 359 case WBP_Regional:
356 /* WB13c */ 360 /* WB13c */
357 if (wbcSeqStart == WBP_Regional) 361 if (wbcSeqStart == WBP_Regional)
358 { 362 {
359 set_brks_to(s, brks, posLast, posCur, len, 363 set_brks_to(s, brks, posLast, posCur, len,
360 WORDBREAK_NOBREAK, get_next_char); 364 WORDBREAK_NOBREAK, get_next_char);
361 } 365 }
362 wbcSeqStart = wbcCur; 366 wbcSeqStart = wbcCur;
363 posLast = posCur; 367 posLast = posCur;
364 break; 368 break;
365 369
366 case WBP_Any: 370 case WBP_Any:
367 /* Allow breaks and reset */ 371 /* Allow breaks and reset */
368 set_brks_to(s, brks, posLast, posCur, len, 372 set_brks_to(s, brks, posLast, posCur, len,
369 WORDBREAK_BREAK, get_next_char); 373 WORDBREAK_BREAK, get_next_char);
370 wbcSeqStart = wbcCur; 374 wbcSeqStart = wbcCur;
371 posLast = posCur; 375 posLast = posCur;
372 break; 376 break;
373 377
374 default: 378 default:
375 /* Error, should never get here! */ 379 /* Error, should never get here! */
376 assert(0); 380 assert(0);
377 break; 381 break;
378 } 382 }
379 383
380 wbcLast = wbcCur; 384 wbcLast = wbcCur;
381 posCur = posNext; 385 posCur = posNext;
382 ch = get_next_char(s, len, &posNext); 386 ch = get_next_char(s, len, &posNext);
383 } 387 }
384 388
385 /* WB2 */ 389 /* WB2 */
386 set_brks_to(s, brks, posLast, posNext, len, 390 set_brks_to(s, brks, posLast, posNext, len,
387 WORDBREAK_BREAK, get_next_char); 391 WORDBREAK_BREAK, get_next_char);
388} 392}
389 393
390/** 394/**
391 * Sets the word breaking information for a UTF-8 input string. 395 * Sets the word breaking information for a UTF-8 input string.
392 * 396 *
393 * @param[in] s input UTF-8 string 397 * @param[in] s input UTF-8 string
394 * @param[in] len length of the input 398 * @param[in] len length of the input
395 * @param[in] lang language of the input 399 * @param[in] lang language of the input
396 * @param[out] brks pointer to the output breaking data, containing 400 * @param[out] brks pointer to the output breaking data, containing
397 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or 401 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
398 * #WORDBREAK_INSIDEACHAR 402 * #WORDBREAK_INSIDEACHAR
399 */ 403 */
400void set_wordbreaks_utf8( 404void set_wordbreaks_utf8(
401 const utf8_t *s, 405 const utf8_t *s,
402 size_t len, 406 size_t len,
403 const char *lang, 407 const char *lang,
404 char *brks) 408 char *brks)
405{ 409{
406 set_wordbreaks(s, len, lang, brks, 410 set_wordbreaks(s, len, lang, brks,
407 (get_next_char_t)lb_get_next_char_utf8); 411 (get_next_char_t)lb_get_next_char_utf8);
408} 412}
409 413
410/** 414/**
411 * Sets the word breaking information for a UTF-16 input string. 415 * Sets the word breaking information for a UTF-16 input string.
412 * 416 *
413 * @param[in] s input UTF-16 string 417 * @param[in] s input UTF-16 string
414 * @param[in] len length of the input 418 * @param[in] len length of the input
415 * @param[in] lang language of the input 419 * @param[in] lang language of the input
416 * @param[out] brks pointer to the output breaking data, containing 420 * @param[out] brks pointer to the output breaking data, containing
417 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or 421 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
418 * #WORDBREAK_INSIDEACHAR 422 * #WORDBREAK_INSIDEACHAR
419 */ 423 */
420void set_wordbreaks_utf16( 424void set_wordbreaks_utf16(
421 const utf16_t *s, 425 const utf16_t *s,
422 size_t len, 426 size_t len,
423 const char *lang, 427 const char *lang,
424 char *brks) 428 char *brks)
425{ 429{
426 set_wordbreaks(s, len, lang, brks, 430 set_wordbreaks(s, len, lang, brks,
427 (get_next_char_t)lb_get_next_char_utf16); 431 (get_next_char_t)lb_get_next_char_utf16);
428} 432}
429 433
430/** 434/**
431 * Sets the word breaking information for a UTF-32 input string. 435 * Sets the word breaking information for a UTF-32 input string.
432 * 436 *
433 * @param[in] s input UTF-32 string 437 * @param[in] s input UTF-32 string
434 * @param[in] len length of the input 438 * @param[in] len length of the input
435 * @param[in] lang language of the input 439 * @param[in] lang language of the input
436 * @param[out] brks pointer to the output breaking data, containing 440 * @param[out] brks pointer to the output breaking data, containing
437 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or 441 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
438 * #WORDBREAK_INSIDEACHAR 442 * #WORDBREAK_INSIDEACHAR
439 */ 443 */
440void set_wordbreaks_utf32( 444void set_wordbreaks_utf32(
441 const utf32_t *s, 445 const utf32_t *s,
442 size_t len, 446 size_t len,
443 const char *lang, 447 const char *lang,
444 char *brks) 448 char *brks)
445{ 449{
446 set_wordbreaks(s, len, lang, brks, 450 set_wordbreaks(s, len, lang, brks,
447 (get_next_char_t)lb_get_next_char_utf32); 451 (get_next_char_t)lb_get_next_char_utf32);
448} 452}