summaryrefslogtreecommitdiff
path: root/src/static_libs/libunibreak/wordbreak.c
diff options
context:
space:
mode:
authorTom Hacohen <tom@stosb.com>2013-01-11 18:16:09 +0000
committerTom Hacohen <tom@stosb.com>2013-01-11 18:16:09 +0000
commit88ab486e6332ffe3d789e2435ee9a78751f88bf0 (patch)
tree2a2a13369b92122ff157952cb7359f8a4e95a42e /src/static_libs/libunibreak/wordbreak.c
parentd83b83e10befda5a8065069a3152846decf91aa3 (diff)
Efl static_libs: Updated liblinebreak -> libunibreak.
SVN revision: 82652
Diffstat (limited to 'src/static_libs/libunibreak/wordbreak.c')
-rw-r--r--src/static_libs/libunibreak/wordbreak.c437
1 files changed, 437 insertions, 0 deletions
diff --git a/src/static_libs/libunibreak/wordbreak.c b/src/static_libs/libunibreak/wordbreak.c
new file mode 100644
index 0000000000..60db99e426
--- /dev/null
+++ b/src/static_libs/libunibreak/wordbreak.c
@@ -0,0 +1,437 @@
1/* vim: set tabstop=4 shiftwidth=4: */
2
3/*
4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2012 Tom Hacohen <tom@stosb.com>
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
24 * distribution.
25 *
26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29>
28 *
29 * When this library was designed, this annex was at Revision 17, for
30 * Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32 *
33 * The Unicode Terms of Use are available at
34 * <URL:http://www.unicode.org/copyright.html>
35 */
36
37/**
38 * @file wordbreak.c
39 *
40 * Implementation of the word breaking algorithm as described in Unicode
41 * Standard Annex 29.
42 *
43 * @version 2.2, 2012/02/04
44 * @author Tom Hacohen
45 */
46
47#include <assert.h>
48#include <stddef.h>
49#include <string.h>
50#include "linebreak.h"
51#include "linebreakdef.h"
52
53#include "wordbreak.h"
54#include "wordbreakdata.c"
55
/* Number of elements in a true array object (not a pointer).  The argument
 * is parenthesised so that any array-valued expression expands safely. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
57
/**
 * Initialization hook for the word breaking module.
 *
 * There is currently no state to set up, so the body is empty; the
 * function exists so that initialization can be added later.
 */
void init_wordbreak(void)
{
    /* Intentionally empty. */
}
65
66/**
67 * Gets the word breaking class of a character.
68 *
69 * @param ch character to check
70 * @param wbp pointer to the wbp breaking properties array
71 * @param len size of the wbp array in number of items
72 * @return the word breaking class if found; \c WBP_Any otherwise
73 */
74static enum WordBreakClass get_char_wb_class(
75 utf32_t ch,
76 struct WordBreakProperties *wbp,
77 size_t len)
78{
79 int min = 0;
80 int max = len - 1;
81 int mid;
82
83 do
84 {
85 mid = (min + max) / 2;
86
87 if (ch < wbp[mid].start)
88 max = mid - 1;
89 else if (ch > wbp[mid].end)
90 min = mid + 1;
91 else
92 return wbp[mid].prop;
93 }
94 while (min <= max);
95
96 return WBP_Any;
97}
98
99/**
100 * Sets the word break types to a specific value in a range.
101 *
102 * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
103 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
104 * cells that we really don't want to break after.
105 *
106 * @param[in] s input string
107 * @param[out] brks breaks array to fill
108 * @param[in] posStart start position
109 * @param[in] posEnd end position (exclusive)
110 * @param[in] len length of the string
111 * @param[in] brkType breaks type to use
112 * @param[in] get_next_char function to get the next UTF-32 character
113 */
114static void set_brks_to(
115 const void *s,
116 char *brks,
117 size_t posStart,
118 size_t posEnd,
119 size_t len,
120 char brkType,
121 get_next_char_t get_next_char)
122{
123 size_t posNext = posStart;
124 while (posNext < posEnd)
125 {
126 utf32_t ch;
127 ch = get_next_char(s, len, &posNext);
128 assert(ch != EOS);
129 for (; posStart < posNext - 1; ++posStart)
130 brks[posStart] = WORDBREAK_INSIDEACHAR;
131 assert(posStart == posNext - 1);
132
133 /* Only set it if we haven't set it not to break before. */
134 if (brks[posStart] != WORDBREAK_NOBREAK)
135 brks[posStart] = brkType;
136 posStart = posNext;
137 }
138}
139
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
 * The argument is parenthesised so that compound expressions compare as a
 * whole; it is evaluated more than once, so it must be free of side
 * effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
                       ((cls) == WBP_LF))
143
144/**
145 * Sets the word breaking information for a generic input string.
146 *
147 * @param[in] s input string
148 * @param[in] len length of the input
149 * @param[in] lang language of the input
150 * @param[out] brks pointer to the output breaking data, containing
151 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
152 * #WORDBREAK_INSIDEACHAR
153 * @param[in] get_next_char function to get the next UTF-32 character
154 */
155static void set_wordbreaks(
156 const void *s,
157 size_t len,
158 const char *lang,
159 char *brks,
160 get_next_char_t get_next_char)
161{
162 enum WordBreakClass wbcLast = WBP_Undefined;
163 /* wbcSeqStart is the class that started the current sequence.
164 * WBP_Undefined is a special case that means "sot".
165 * This value is the class that is at the start of the current rule
166 * matching sequence. For example, in case of Numeric+MidNum+Numeric
167 * it'll be Numeric all the way.
168 */
169 enum WordBreakClass wbcSeqStart = WBP_Undefined;
170 utf32_t ch;
171 size_t posNext = 0;
172 size_t posCur = 0;
173 size_t posLast = 0;
174
175 /* TODO: Language-specific specialization. */
176 (void) lang;
177
178 /* Init brks. */
179 memset(brks, WORDBREAK_BREAK, len);
180
181 ch = get_next_char(s, len, &posNext);
182
183 while (ch != EOS)
184 {
185 enum WordBreakClass wbcCur;
186 wbcCur = get_char_wb_class(ch, wb_prop_default,
187 ARRAY_LEN(wb_prop_default));
188
189 switch (wbcCur)
190 {
191 case WBP_CR:
192 /* WB3b */
193 set_brks_to(s, brks, posLast, posCur, len,
194 WORDBREAK_BREAK, get_next_char);
195 wbcSeqStart = wbcCur;
196 posLast = posCur;
197 break;
198
199 case WBP_LF:
200 if (wbcSeqStart == WBP_CR) /* WB3 */
201 {
202 set_brks_to(s, brks, posLast, posCur, len,
203 WORDBREAK_NOBREAK, get_next_char);
204 wbcSeqStart = wbcCur;
205 posLast = posCur;
206 break;
207 }
208 /* Fall off */
209
210 case WBP_Newline:
211 /* WB3a,3b */
212 set_brks_to(s, brks, posLast, posCur, len,
213 WORDBREAK_BREAK, get_next_char);
214 wbcSeqStart = wbcCur;
215 posLast = posCur;
216 break;
217
218 case WBP_Extend:
219 case WBP_Format:
220 /* WB4 - If not the first char/after a newline (WB3a,3b), skip
221 * this class, set it to be the same as the prev, and mark
222 * brks not to break before them. */
223 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
224 {
225 set_brks_to(s, brks, posLast, posCur, len,
226 WORDBREAK_BREAK, get_next_char);
227 wbcSeqStart = wbcCur;
228 }
229 else
230 {
231 /* It's surely not the first */
232 brks[posCur - 1] = WORDBREAK_NOBREAK;
233 /* "inherit" the previous class. */
234 wbcCur = wbcLast;
235 }
236 break;
237
238 case WBP_Katakana:
239 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
240 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
241 {
242 set_brks_to(s, brks, posLast, posCur, len,
243 WORDBREAK_NOBREAK, get_next_char);
244 }
245 /* No rule found, reset */
246 else
247 {
248 set_brks_to(s, brks, posLast, posCur, len,
249 WORDBREAK_BREAK, get_next_char);
250 }
251 wbcSeqStart = wbcCur;
252 posLast = posCur;
253 break;
254
255 case WBP_ALetter:
256 if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
257 (wbcLast == WBP_Numeric) || /* WB10 */
258 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
259 {
260 set_brks_to(s, brks, posLast, posCur, len,
261 WORDBREAK_NOBREAK, get_next_char);
262 }
263 /* No rule found, reset */
264 else
265 {
266 set_brks_to(s, brks, posLast, posCur, len,
267 WORDBREAK_BREAK, get_next_char);
268 }
269 wbcSeqStart = wbcCur;
270 posLast = posCur;
271 break;
272
273 case WBP_MidNumLet:
274 if ((wbcLast == WBP_ALetter) || /* WB6,7 */
275 (wbcLast == WBP_Numeric)) /* WB11,12 */
276 {
277 /* Go on */
278 }
279 else
280 {
281 set_brks_to(s, brks, posLast, posCur, len,
282 WORDBREAK_BREAK, get_next_char);
283 wbcSeqStart = wbcCur;
284 posLast = posCur;
285 }
286 break;
287
288 case WBP_MidLetter:
289 if (wbcLast == WBP_ALetter) /* WB6,7 */
290 {
291 /* Go on */
292 }
293 else
294 {
295 set_brks_to(s, brks, posLast, posCur, len,
296 WORDBREAK_BREAK, get_next_char);
297 wbcSeqStart = wbcCur;
298 posLast = posCur;
299 }
300 break;
301
302 case WBP_MidNum:
303 if (wbcLast == WBP_Numeric) /* WB11,12 */
304 {
305 /* Go on */
306 }
307 else
308 {
309 set_brks_to(s, brks, posLast, posCur, len,
310 WORDBREAK_BREAK, get_next_char);
311 wbcSeqStart = wbcCur;
312 posLast = posCur;
313 }
314 break;
315
316 case WBP_Numeric:
317 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
318 (wbcLast == WBP_ALetter) || /* WB9 */
319 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
320 {
321 set_brks_to(s, brks, posLast, posCur, len,
322 WORDBREAK_NOBREAK, get_next_char);
323 }
324 /* No rule found, reset */
325 else
326 {
327 set_brks_to(s, brks, posLast, posCur, len,
328 WORDBREAK_BREAK, get_next_char);
329 }
330 wbcSeqStart = wbcCur;
331 posLast = posCur;
332 break;
333
334 case WBP_ExtendNumLet:
335 /* WB13a,13b */
336 if ((wbcSeqStart == wbcLast) &&
337 ((wbcLast == WBP_ALetter) ||
338 (wbcLast == WBP_Numeric) ||
339 (wbcLast == WBP_Katakana) ||
340 (wbcLast == WBP_ExtendNumLet)))
341 {
342 set_brks_to(s, brks, posLast, posCur, len,
343 WORDBREAK_NOBREAK, get_next_char);
344 }
345 /* No rule found, reset */
346 else
347 {
348 set_brks_to(s, brks, posLast, posCur, len,
349 WORDBREAK_BREAK, get_next_char);
350 }
351 wbcSeqStart = wbcCur;
352 posLast = posCur;
353 break;
354
355 case WBP_Any:
356 /* Allow breaks and reset */
357 set_brks_to(s, brks, posLast, posCur, len,
358 WORDBREAK_BREAK, get_next_char);
359 wbcSeqStart = wbcCur;
360 posLast = posCur;
361 break;
362
363 default:
364 /* Error, should never get here! */
365 assert(0);
366 break;
367 }
368
369 wbcLast = wbcCur;
370 posCur = posNext;
371 ch = get_next_char(s, len, &posNext);
372 }
373
374 /* WB2 */
375 set_brks_to(s, brks, posLast, posNext, len,
376 WORDBREAK_BREAK, get_next_char);
377}
378
379/**
380 * Sets the word breaking information for a UTF-8 input string.
381 *
382 * @param[in] s input UTF-8 string
383 * @param[in] len length of the input
384 * @param[in] lang language of the input
385 * @param[out] brks pointer to the output breaking data, containing
386 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
387 * #WORDBREAK_INSIDEACHAR
388 */
389void set_wordbreaks_utf8(
390 const utf8_t *s,
391 size_t len,
392 const char *lang,
393 char *brks)
394{
395 set_wordbreaks(s, len, lang, brks,
396 (get_next_char_t)lb_get_next_char_utf8);
397}
398
399/**
400 * Sets the word breaking information for a UTF-16 input string.
401 *
402 * @param[in] s input UTF-16 string
403 * @param[in] len length of the input
404 * @param[in] lang language of the input
405 * @param[out] brks pointer to the output breaking data, containing
406 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
407 * #WORDBREAK_INSIDEACHAR
408 */
409void set_wordbreaks_utf16(
410 const utf16_t *s,
411 size_t len,
412 const char *lang,
413 char *brks)
414{
415 set_wordbreaks(s, len, lang, brks,
416 (get_next_char_t)lb_get_next_char_utf16);
417}
418
419/**
420 * Sets the word breaking information for a UTF-32 input string.
421 *
422 * @param[in] s input UTF-32 string
423 * @param[in] len length of the input
424 * @param[in] lang language of the input
425 * @param[out] brks pointer to the output breaking data, containing
426 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
427 * #WORDBREAK_INSIDEACHAR
428 */
429void set_wordbreaks_utf32(
430 const utf32_t *s,
431 size_t len,
432 const char *lang,
433 char *brks)
434{
435 set_wordbreaks(s, len, lang, brks,
436 (get_next_char_t)lb_get_next_char_utf32);
437}