summaryrefslogtreecommitdiff
path: root/src/static_libs/libunibreak/wordbreak.c
diff options
context:
space:
mode:
author Tom Hacohen <tom@stosb.com> 2015-05-07 10:02:40 +0100
committer Tom Hacohen <tom@stosb.com> 2015-05-07 10:03:26 +0100
commit a2a9f33802a3923c1469789f66d5fdab1eaea943 (patch)
tree 8c114ee6b77a5877dff96a62f90b2f9b23d50f2c /src/static_libs/libunibreak/wordbreak.c
parent 92ff90ecca98f9e8e66a1f7a3ecf4e46f65913d4 (diff)
Static deps: Move unibreak to be an external dep.
We need any version of libunibreak. The first one has been released in mid 2012. Even slow distros like ubuntu already have an LTS out with a good enough version, so I consider this enough to remove the maintenance cost. This has been discussed on IRC. @feature
Diffstat (limited to 'src/static_libs/libunibreak/wordbreak.c')
-rw-r--r-- src/static_libs/libunibreak/wordbreak.c 453
1 files changed, 0 insertions, 453 deletions
diff --git a/src/static_libs/libunibreak/wordbreak.c b/src/static_libs/libunibreak/wordbreak.c
deleted file mode 100644
index 5c1e3d0e79..0000000000
--- a/src/static_libs/libunibreak/wordbreak.c
+++ /dev/null
@@ -1,453 +0,0 @@
1/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2
3/*
4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2013 Tom Hacohen <tom at stosb dot com>
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
24 * distribution.
25 *
26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29>
28 *
29 * When this library was designed, this annex was at Revision 17, for
30 * Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32 *
33 * This library has been updated according to Revision 21, for
34 * Unicode 6.2.0:
35 * <URL:http://www.unicode.org/reports/tr29/tr29-21.html>
36 *
37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html>
39 */
40
41/**
42 * @file wordbreak.c
43 *
44 * Implementation of the word breaking algorithm as described in Unicode
45 * Standard Annex 29.
46 *
47 * @version 2.4, 2013/09/28
48 * @author Tom Hacohen
49 */
50
51#include <assert.h>
52#include <stddef.h>
53#include <string.h>
54#include "linebreak.h"
55#include "linebreakdef.h"
56
57#include "wordbreak.h"
58#include "wordbreakdata.c"
59
/* Number of elements in an array object.  Only meaningful for true arrays:
 * a pointer (including an array-typed function parameter) gives a wrong
 * result.  The argument is parenthesized so any array-typed expression can
 * be passed safely. */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
61
/**
 * Initializes the wordbreak internals.
 *
 * There is currently no state to set up, so this is a no-op; it exists so
 * that initialization work can be added later.
 */
void init_wordbreak(void)
{
    /* Intentionally empty: nothing to initialize at the moment. */
    return;
}
69
70/**
71 * Gets the word breaking class of a character.
72 *
73 * @param ch character to check
74 * @param wbp pointer to the wbp breaking properties array
75 * @param len size of the wbp array in number of items
76 * @return the word breaking class if found; \c WBP_Any otherwise
77 */
78static enum WordBreakClass get_char_wb_class(
79 utf32_t ch,
80 struct WordBreakProperties *wbp,
81 size_t len)
82{
83 int min = 0;
84 int max = len - 1;
85 int mid;
86
87 do
88 {
89 mid = (min + max) / 2;
90
91 if (ch < wbp[mid].start)
92 max = mid - 1;
93 else if (ch > wbp[mid].end)
94 min = mid + 1;
95 else
96 return wbp[mid].prop;
97 }
98 while (min <= max);
99
100 return WBP_Any;
101}
102
103/**
104 * Sets the word break types to a specific value in a range.
105 *
106 * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
107 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
108 * cells that we really don't want to break after.
109 *
110 * @param[in] s input string
111 * @param[out] brks breaks array to fill
112 * @param[in] posStart start position
113 * @param[in] posEnd end position (exclusive)
114 * @param[in] len length of the string
115 * @param[in] brkType breaks type to use
116 * @param[in] get_next_char function to get the next UTF-32 character
117 */
118static void set_brks_to(
119 const void *s,
120 char *brks,
121 size_t posStart,
122 size_t posEnd,
123 size_t len,
124 char brkType,
125 get_next_char_t get_next_char)
126{
127 size_t posNext = posStart;
128 while (posNext < posEnd)
129 {
130 utf32_t ch;
131 (void)ch;
132 ch = get_next_char(s, len, &posNext);
133 assert(ch != EOS);
134 for (; posStart < posNext - 1; ++posStart)
135 brks[posStart] = WORDBREAK_INSIDEACHAR;
136 assert(posStart == posNext - 1);
137
138 /* Only set it if we haven't set it not to break before. */
139 if (brks[posStart] != WORDBREAK_NOBREAK)
140 brks[posStart] = brkType;
141 posStart = posNext;
142 }
143}
144
/* Checks to see if the class is newline, CR, or LF (rules WB3a and b).
 * The argument is parenthesized so that any expression can be passed; note
 * that it may be evaluated up to three times, so it must be free of side
 * effects. */
#define IS_WB3ab(cls) (((cls) == WBP_Newline) || ((cls) == WBP_CR) || \
                       ((cls) == WBP_LF))
148
149/**
150 * Sets the word breaking information for a generic input string.
151 *
152 * @param[in] s input string
153 * @param[in] len length of the input
154 * @param[in] lang language of the input
155 * @param[out] brks pointer to the output breaking data, containing
156 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
157 * #WORDBREAK_INSIDEACHAR
158 * @param[in] get_next_char function to get the next UTF-32 character
159 */
160static void set_wordbreaks(
161 const void *s,
162 size_t len,
163 const char *lang,
164 char *brks,
165 get_next_char_t get_next_char)
166{
167 enum WordBreakClass wbcLast = WBP_Undefined;
168 /* wbcSeqStart is the class that started the current sequence.
169 * WBP_Undefined is a special case that means "sot".
170 * This value is the class that is at the start of the current rule
171 * matching sequence. For example, in case of Numeric+MidNum+Numeric
172 * it'll be Numeric all the way.
173 */
174 enum WordBreakClass wbcSeqStart = WBP_Undefined;
175 utf32_t ch;
176 size_t posNext = 0;
177 size_t posCur = 0;
178 size_t posLast = 0;
179
180 /* TODO: Language-specific specialization. */
181 (void) lang;
182
183 /* Init brks. */
184 memset(brks, WORDBREAK_BREAK, len);
185
186 ch = get_next_char(s, len, &posNext);
187
188 while (ch != EOS)
189 {
190 enum WordBreakClass wbcCur;
191 wbcCur = get_char_wb_class(ch, wb_prop_default,
192 ARRAY_LEN(wb_prop_default));
193
194 switch (wbcCur)
195 {
196 case WBP_CR:
197 /* WB3b */
198 set_brks_to(s, brks, posLast, posCur, len,
199 WORDBREAK_BREAK, get_next_char);
200 wbcSeqStart = wbcCur;
201 posLast = posCur;
202 break;
203
204 case WBP_LF:
205 if (wbcSeqStart == WBP_CR) /* WB3 */
206 {
207 set_brks_to(s, brks, posLast, posCur, len,
208 WORDBREAK_NOBREAK, get_next_char);
209 wbcSeqStart = wbcCur;
210 posLast = posCur;
211 break;
212 }
213 /* Fall off */
214
215 case WBP_Newline:
216 /* WB3a,3b */
217 set_brks_to(s, brks, posLast, posCur, len,
218 WORDBREAK_BREAK, get_next_char);
219 wbcSeqStart = wbcCur;
220 posLast = posCur;
221 break;
222
223 case WBP_Extend:
224 case WBP_Format:
225 /* WB4 - If not the first char/after a newline (WB3a,3b), skip
226 * this class, set it to be the same as the prev, and mark
227 * brks not to break before them. */
228 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
229 {
230 set_brks_to(s, brks, posLast, posCur, len,
231 WORDBREAK_BREAK, get_next_char);
232 wbcSeqStart = wbcCur;
233 }
234 else
235 {
236 /* It's surely not the first */
237 brks[posCur - 1] = WORDBREAK_NOBREAK;
238 /* "inherit" the previous class. */
239 wbcCur = wbcLast;
240 }
241 break;
242
243 case WBP_Katakana:
244 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
245 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
246 {
247 set_brks_to(s, brks, posLast, posCur, len,
248 WORDBREAK_NOBREAK, get_next_char);
249 }
250 /* No rule found, reset */
251 else
252 {
253 set_brks_to(s, brks, posLast, posCur, len,
254 WORDBREAK_BREAK, get_next_char);
255 }
256 wbcSeqStart = wbcCur;
257 posLast = posCur;
258 break;
259
260 case WBP_ALetter:
261 if ((wbcSeqStart == WBP_ALetter) || /* WB5,6,7 */
262 (wbcLast == WBP_Numeric) || /* WB10 */
263 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
264 {
265 set_brks_to(s, brks, posLast, posCur, len,
266 WORDBREAK_NOBREAK, get_next_char);
267 }
268 /* No rule found, reset */
269 else
270 {
271 set_brks_to(s, brks, posLast, posCur, len,
272 WORDBREAK_BREAK, get_next_char);
273 }
274 wbcSeqStart = wbcCur;
275 posLast = posCur;
276 break;
277
278 case WBP_MidNumLet:
279 if ((wbcLast == WBP_ALetter) || /* WB6,7 */
280 (wbcLast == WBP_Numeric)) /* WB11,12 */
281 {
282 /* Go on */
283 }
284 else
285 {
286 set_brks_to(s, brks, posLast, posCur, len,
287 WORDBREAK_BREAK, get_next_char);
288 wbcSeqStart = wbcCur;
289 posLast = posCur;
290 }
291 break;
292
293 case WBP_MidLetter:
294 if (wbcLast == WBP_ALetter) /* WB6,7 */
295 {
296 /* Go on */
297 }
298 else
299 {
300 set_brks_to(s, brks, posLast, posCur, len,
301 WORDBREAK_BREAK, get_next_char);
302 wbcSeqStart = wbcCur;
303 posLast = posCur;
304 }
305 break;
306
307 case WBP_MidNum:
308 if (wbcLast == WBP_Numeric) /* WB11,12 */
309 {
310 /* Go on */
311 }
312 else
313 {
314 set_brks_to(s, brks, posLast, posCur, len,
315 WORDBREAK_BREAK, get_next_char);
316 wbcSeqStart = wbcCur;
317 posLast = posCur;
318 }
319 break;
320
321 case WBP_Numeric:
322 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
323 (wbcLast == WBP_ALetter) || /* WB9 */
324 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
325 {
326 set_brks_to(s, brks, posLast, posCur, len,
327 WORDBREAK_NOBREAK, get_next_char);
328 }
329 /* No rule found, reset */
330 else
331 {
332 set_brks_to(s, brks, posLast, posCur, len,
333 WORDBREAK_BREAK, get_next_char);
334 }
335 wbcSeqStart = wbcCur;
336 posLast = posCur;
337 break;
338
339 case WBP_ExtendNumLet:
340 /* WB13a,13b */
341 if ((wbcSeqStart == wbcLast) &&
342 ((wbcLast == WBP_ALetter) ||
343 (wbcLast == WBP_Numeric) ||
344 (wbcLast == WBP_Katakana) ||
345 (wbcLast == WBP_ExtendNumLet)))
346 {
347 set_brks_to(s, brks, posLast, posCur, len,
348 WORDBREAK_NOBREAK, get_next_char);
349 }
350 /* No rule found, reset */
351 else
352 {
353 set_brks_to(s, brks, posLast, posCur, len,
354 WORDBREAK_BREAK, get_next_char);
355 }
356 wbcSeqStart = wbcCur;
357 posLast = posCur;
358 break;
359
360 case WBP_Regional:
361 /* WB13c */
362 if (wbcSeqStart == WBP_Regional)
363 {
364 set_brks_to(s, brks, posLast, posCur, len,
365 WORDBREAK_NOBREAK, get_next_char);
366 }
367 wbcSeqStart = wbcCur;
368 posLast = posCur;
369 break;
370
371 case WBP_Any:
372 /* Allow breaks and reset */
373 set_brks_to(s, brks, posLast, posCur, len,
374 WORDBREAK_BREAK, get_next_char);
375 wbcSeqStart = wbcCur;
376 posLast = posCur;
377 break;
378
379 default:
380 /* Error, should never get here! */
381 assert(0);
382 break;
383 }
384
385 wbcLast = wbcCur;
386 posCur = posNext;
387 ch = get_next_char(s, len, &posNext);
388 }
389
390 /* WB2 */
391 set_brks_to(s, brks, posLast, posNext, len,
392 WORDBREAK_BREAK, get_next_char);
393}
394
395/**
396 * Sets the word breaking information for a UTF-8 input string.
397 *
398 * @param[in] s input UTF-8 string
399 * @param[in] len length of the input
400 * @param[in] lang language of the input
401 * @param[out] brks pointer to the output breaking data, containing
402 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
403 * #WORDBREAK_INSIDEACHAR
404 */
405void set_wordbreaks_utf8(
406 const utf8_t *s,
407 size_t len,
408 const char *lang,
409 char *brks)
410{
411 set_wordbreaks(s, len, lang, brks,
412 (get_next_char_t)lb_get_next_char_utf8);
413}
414
415/**
416 * Sets the word breaking information for a UTF-16 input string.
417 *
418 * @param[in] s input UTF-16 string
419 * @param[in] len length of the input
420 * @param[in] lang language of the input
421 * @param[out] brks pointer to the output breaking data, containing
422 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
423 * #WORDBREAK_INSIDEACHAR
424 */
425void set_wordbreaks_utf16(
426 const utf16_t *s,
427 size_t len,
428 const char *lang,
429 char *brks)
430{
431 set_wordbreaks(s, len, lang, brks,
432 (get_next_char_t)lb_get_next_char_utf16);
433}
434
435/**
436 * Sets the word breaking information for a UTF-32 input string.
437 *
438 * @param[in] s input UTF-32 string
439 * @param[in] len length of the input
440 * @param[in] lang language of the input
441 * @param[out] brks pointer to the output breaking data, containing
442 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
443 * #WORDBREAK_INSIDEACHAR
444 */
445void set_wordbreaks_utf32(
446 const utf32_t *s,
447 size_t len,
448 const char *lang,
449 char *brks)
450{
451 set_wordbreaks(s, len, lang, brks,
452 (get_next_char_t)lb_get_next_char_utf32);
453}