summaryrefslogtreecommitdiff
path: root/src/static_libs/libunibreak/unibreakdef.c
blob: 20ce2b3484ded04746b2ab8e7273b033c9faaceb (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */

/*
 * Break processing in a Unicode sequence.  Designed to be used in a
 * generic text renderer.
 *
 * Copyright (C) 2015 Wu Yongwei <wuyongwei at gmail dot com>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the author be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute
 * it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must
 *    not claim that you wrote the original software.  If you use this
 *    software in a product, an acknowledgement in the product
 *    documentation would be appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must
 *    not be misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source
 *    distribution.
 */

/**
 * @file    unibreakdef.c
 *
 * Definition of utility functions used by the libunibreak library.
 *
 * @version 3.0, 2015/05/10
 * @author  Wu Yongwei
 */

#include <assert.h>
#include <stddef.h>
#include "unibreakdef.h"

/**
 * Gets the next Unicode character in a UTF-8 sequence.  The index will
 * be advanced to the next complete character, unless the end of string
 * is reached in the middle of a UTF-8 sequence.
 *
 * @param[in]     s    input UTF-8 string
 * @param[in]     len  length of the string in bytes
 * @param[in,out] ip   pointer to the index
 * @return             the Unicode character beginning at the index; or
 *                     #EOS if end of input is encountered
 */
utf32_t ub_get_next_char_utf8(
        const utf8_t *s,
        size_t len,
        size_t *ip)
{
    utf8_t ch;
    utf32_t res;

    assert(*ip <= len);
    if (*ip == len)
        return EOS;
    ch = s[*ip];

    if (ch < 0xC2 || ch > 0xF4)
    {   /* One-byte sequence, tail (should not occur), or invalid */
        *ip += 1;
        return ch;
    }
    else if (ch < 0xE0)
    {   /* Two-byte sequence */
        if (*ip + 2 > len)
            return EOS;
        res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F);
        *ip += 2;
        return res;
    }
    else if (ch < 0xF0)
    {   /* Three-byte sequence */
        if (*ip + 3 > len)
            return EOS;
        res = ((ch & 0x0F) << 12) +
              ((s[*ip + 1] & 0x3F) << 6) +
              ((s[*ip + 2] & 0x3F));
        *ip += 3;
        return res;
    }
    else
    {   /* Four-byte sequence */
        if (*ip + 4 > len)
            return EOS;
        res = ((ch & 0x07) << 18) +
              ((s[*ip + 1] & 0x3F) << 12) +
              ((s[*ip + 2] & 0x3F) << 6) +
              ((s[*ip + 3] & 0x3F));
        *ip += 4;
        return res;
    }
}

/**
 * Gets the next Unicode character in a UTF-16 sequence.  The index will
 * be advanced to the next complete character, unless the end of string
 * is reached in the middle of a UTF-16 surrogate pair.
 *
 * @param[in]     s    input UTF-16 string
 * @param[in]     len  length of the string in words
 * @param[in,out] ip   pointer to the index
 * @return             the Unicode character beginning at the index; or
 *                     #EOS if end of input is encountered
 */
utf32_t ub_get_next_char_utf16(
        const utf16_t *s,
        size_t len,
        size_t *ip)
{
    utf16_t ch;

    assert(*ip <= len);
    if (*ip == len)
        return EOS;
    ch = s[(*ip)++];

    if (ch < 0xD800 || ch > 0xDBFF)
    {   /* If the character is not a high surrogate */
        return ch;
    }
    if (*ip == len)
    {   /* If the input ends here (an error) */
        --(*ip);
        return EOS;
    }
    if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF)
    {   /* If the next character is not the low surrogate (an error) */
        return ch;
    }
    /* Return the constructed character and advance the index again */
    return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000;
}

/**
 * Fetches the Unicode character at the given index of a UTF-32 string
 * and moves the index forward by one.  The value is returned exactly as
 * stored; no validation is performed.
 *
 * @param[in]     s    input UTF-32 string
 * @param[in]     len  length of the string in dwords
 * @param[in,out] ip   pointer to the index; must not exceed \a len
 * @return             the Unicode character at the index; or #EOS if
 *                     the index is at the end of input
 */
utf32_t ub_get_next_char_utf32(
        const utf32_t *s,
        size_t len,
        size_t *ip)
{
    size_t pos;

    assert(*ip <= len);
    pos = *ip;
    if (pos == len)
    {   /* Nothing left to read; the index stays where it is */
        return EOS;
    }
    *ip = pos + 1;
    return s[pos];
}