/* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */ /* * Break processing in a Unicode sequence. Designed to be used in a * generic text renderer. * * Copyright (C) 2015-2016 Wu Yongwei * * This software is provided 'as-is', without any express or implied * warranty. In no event will the author be held liable for any damages * arising from the use of this software. * * Permission is granted to anyone to use this software for any purpose, * including commercial applications, and to alter it and redistribute * it freely, subject to the following restrictions: * * 1. The origin of this software must not be misrepresented; you must * not claim that you wrote the original software. If you use this * software in a product, an acknowledgement in the product * documentation would be appreciated but is not required. * 2. Altered source versions must be plainly marked as such, and must * not be misrepresented as being the original software. * 3. This notice may not be removed or altered from any source * distribution. */ /** * @file unibreakdef.c * * Definition of utility functions used by the libunibreak library. * * @author Wu Yongwei */ #include #include #include "unibreakdef.h" /** * Gets the next Unicode character in a UTF-8 sequence. The index will * be advanced to the next complete character, unless the end of string * is reached in the middle of a UTF-8 sequence. * * @param[in] s input UTF-8 string * @param[in] len length of the string in bytes * @param[in,out] ip pointer to the index * @return the Unicode character beginning at the index; or * #EOS if end of input is encountered */ utf32_t ub_get_next_char_utf8( const utf8_t *s, size_t len, size_t *ip) { utf8_t ch; utf32_t res; assert(*ip <= len); if (*ip == len) return EOS; ch = s[*ip]; if (ch < 0xC2 || ch > 0xF4) { /* One-byte sequence, tail (should not occur), or invalid */ *ip += 1; return ch; } else if (ch < 0xE0) { /* Two-byte sequence */ if (*ip + 2 > len) return EOS; res = ((ch & 0x1F) << 6) + (s[*ip + 1] & 0x3F); *ip += 2; return res; } else if (ch < 0xF0) { /* Three-byte sequence */ if (*ip + 3 > len) return EOS; res = ((ch & 0x0F) << 12) + ((s[*ip + 1] & 0x3F) << 6) + ((s[*ip + 2] & 0x3F)); *ip += 3; return res; } else { /* Four-byte sequence */ if (*ip + 4 > len) return EOS; res = ((ch & 0x07) << 18) + ((s[*ip + 1] & 0x3F) << 12) + ((s[*ip + 2] & 0x3F) << 6) + ((s[*ip + 3] & 0x3F)); *ip += 4; return res; } } /** * Gets the next Unicode character in a UTF-16 sequence. The index will * be advanced to the next complete character, unless the end of string * is reached in the middle of a UTF-16 surrogate pair. * * @param[in] s input UTF-16 string * @param[in] len length of the string in words * @param[in,out] ip pointer to the index * @return the Unicode character beginning at the index; or * #EOS if end of input is encountered */ utf32_t ub_get_next_char_utf16( const utf16_t *s, size_t len, size_t *ip) { utf16_t ch; assert(*ip <= len); if (*ip == len) return EOS; ch = s[(*ip)++]; if (ch < 0xD800 || ch > 0xDBFF) { /* If the character is not a high surrogate */ return ch; } if (*ip == len) { /* If the input ends here (an error) */ --(*ip); return EOS; } if (s[*ip] < 0xDC00 || s[*ip] > 0xDFFF) { /* If the next character is not the low surrogate (an error) */ return ch; } /* Return the constructed character and advance the index again */ return (((utf32_t)ch & 0x3FF) << 10) + (s[(*ip)++] & 0x3FF) + 0x10000; } /** * Gets the next Unicode character in a UTF-32 sequence. The index will * be advanced to the next character. * * @param[in] s input UTF-32 string * @param[in] len length of the string in dwords * @param[in,out] ip pointer to the index * @return the Unicode character beginning at the index; or * #EOS if end of input is encountered */ utf32_t ub_get_next_char_utf32( const utf32_t *s, size_t len, size_t *ip) { assert(*ip <= len); if (*ip == len) return EOS; return s[(*ip)++]; }