You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
273 lines
8.6 KiB
C
273 lines
8.6 KiB
C
/* mini_utf8.h
|
|
*
|
|
* Gunnar Zötl <gz@tset.de> 2014
|
|
*
|
|
* a tiny library to deal with utf8 encoded strings. Tries to fault
|
|
* invalid unicode codepoints and invalid utf8 sequences.
|
|
*
|
|
* Stuff starting with _mini_utf8_* is reserved and private. Don't name your
|
|
* identifiers like that, and don't use stuff named like that.
|
|
*
|
|
* Needed #includes:
|
|
* -----------------
|
|
* -
|
|
*
|
|
* Functions:
|
|
* ----------
|
|
*
|
|
* int mini_utf8_check_encoding(const char* str)
|
|
* test all characters in a string for valid utf8 encoding. Returns
|
|
* 0 if the string is valid utf8, 1 if it is pure ASCII, or -1, if
|
|
* the string is not valid utf8. We do a somewhat relaxed test in
|
|
* that all chars in the range [0x01-0x1F] are considered valid.
|
|
*
|
|
* int mini_utf8_decode(const char **str)
|
|
* returns the next valid utf8 character from *str, updating *str
|
|
* to point behind that char. If *str points to a 0 byte, 0 is
|
|
* returned and *str is not updated. If *str does not point to a
|
|
* valid utf8 encoded char, -1 is returned and *str is not updated.
|
|
*
|
|
* int mini_utf8_encode(int cp, char *str, int len)
|
|
* encodes the codepoint cp into an utf8 byte sequence and stores
|
|
* that into str, where len bytes are available. If that went without
|
|
* errors, the length of the encoded sequence is returned. If cp is
|
|
* not a valid code point, -1 is returned, for all other problems,
|
|
* 0 is returned. If cp is 0, it is stored as a single byte 0, even
|
|
* if that is not really valid utf8. Also, all chars in the range
|
|
* [0x01-0x1F] are considered valid.
|
|
*
|
|
* int mini_utf8_strlen(const char *str)
|
|
* returns the number of utf8 codepoints in the string str, or -1 if
|
|
* the string contains invalid utf8 sequences.
|
|
*
|
|
* int mini_utf8_byteoffset(const char *str, int cpno)
|
|
* returns the number of bytes from the start of the string to the
|
|
* start of codepoint number cpno. Returns >=0 for the offset, or
|
|
* -1 if the string had fewer than cpno codepoints, or contained an
|
|
* invalid utf8 sequence.
|
|
*
|
|
* Example:
|
|
* --------
|
|
*
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "mini_utf8.h"
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
int size = 0x11FFFF;
|
|
int l = size * 4 + 1, i = 0, ok = 1, cp = 0;
|
|
int *ibuf = calloc(size, sizeof(int));
|
|
char *cbuf = calloc(l, sizeof(char));
|
|
char *str = cbuf;
|
|
|
|
while (cp < size) {
|
|
cp = cp + 1;
|
|
int n = mini_utf8_encode(cp, str, l);
|
|
if (n > 0) {
|
|
l -= n;
|
|
str += n;
|
|
ibuf[i++] = cp;
|
|
}
|
|
}
|
|
*str = 0;
|
|
size = i;
|
|
|
|
str = cbuf;
|
|
for (i = 0; ok && (i < size); ++i) {
|
|
cp = mini_utf8_decode((const char**)&str);
|
|
ok = (cp == ibuf[i]);
|
|
}
|
|
|
|
ok = ok && (mini_utf8_strlen(cbuf) == size);
|
|
|
|
printf("Roundtrip test %s.\n", ok ? "succeeded" : "failed");
|
|
|
|
ok = mini_utf8_check_encoding(cbuf);
|
|
|
|
printf("utf8 check %s.\n", ok >= 0 ? "succeeded" : "failed");
|
|
|
|
return ok < 0;
|
|
}
|
|
*
|
|
* License:
|
|
* --------
|
|
*
|
|
* Copyright (c) 2014 Gunnar Zötl <gz@tset.de>
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person
|
|
* obtaining a copy of this software and associated documentation
|
|
* files (the "Software"), to deal in the Software without
|
|
* restriction, including without limitation the rights to use,
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following
|
|
* conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef _mini_utf8
|
|
#define _mini_utf8
|
|
|
|
/* True when s <= c <= e. Evaluates c twice, so only pass expressions
 * without side effects.
 */
#define _mini_utf8_in_range(c, s, e) ((s) <= (c) && (c) <= (e))

/* Check a NUL terminated string for well-formed utf8.
 *
 * The patterns for the encoding check are taken from
 * http://www.w3.org/International/questions/qa-forms-utf-8
 * except that every byte in [0x01-0x7F] is accepted as ASCII.
 *
 * Returns
 *    1  if the string consists of ASCII bytes only (this includes ""),
 *    0  if the string is valid utf8 with at least one multi byte sequence,
 *   -1  if the string contains an invalid sequence, or if str is NULL.
 *
 * Each alternative tests the lead byte against an explicit range. The
 * trailing bytes are tested left to right and the && chain stops at the
 * first mismatch, so the terminating 0 byte (which is never a valid
 * continuation byte) is the last byte that can ever be read.
 */
static inline int mini_utf8_check_encoding(const char *str)
{
    const unsigned char *s = (const unsigned char*) str;
    int pure_ascii = 1;

    if (!str) return -1;

    while (*s) {
        if (*s <= 0x7F) {
            /* [\x01-\x7F]  # ASCII (somewhat relaxed) */
            s += 1;
            continue;
        }

        /* from here on the string is no longer pure ASCII */
        pure_ascii = 0;

        if (_mini_utf8_in_range(s[0], 0xC2, 0xDF)
            && _mini_utf8_in_range(s[1], 0x80, 0xBF)) {
            /* [\xC2-\xDF][\x80-\xBF]  # non-overlong 2-byte */
            s += 2;
        } else if (s[0] == 0xE0
            && _mini_utf8_in_range(s[1], 0xA0, 0xBF)
            && _mini_utf8_in_range(s[2], 0x80, 0xBF)) {
            /* \xE0[\xA0-\xBF][\x80-\xBF]  # excluding overlongs */
            s += 3;
        } else if ((_mini_utf8_in_range(s[0], 0xE1, 0xEC) || s[0] == 0xEE || s[0] == 0xEF)
            && _mini_utf8_in_range(s[1], 0x80, 0xBF)
            && _mini_utf8_in_range(s[2], 0x80, 0xBF)) {
            /* [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte.
             * The lower bound 0xE1 matters: without it stray continuation
             * bytes, 0xC0/0xC1 and overlong 0xE0 sequences slip through.
             */
            s += 3;
        } else if (s[0] == 0xED
            && _mini_utf8_in_range(s[1], 0x80, 0x9F)
            && _mini_utf8_in_range(s[2], 0x80, 0xBF)) {
            /* \xED[\x80-\x9F][\x80-\xBF]  # excluding surrogates */
            s += 3;
        } else if (s[0] == 0xF0
            && _mini_utf8_in_range(s[1], 0x90, 0xBF)
            && _mini_utf8_in_range(s[2], 0x80, 0xBF)
            && _mini_utf8_in_range(s[3], 0x80, 0xBF)) {
            /* \xF0[\x90-\xBF][\x80-\xBF]{2}  # planes 1-3 */
            s += 4;
        } else if (_mini_utf8_in_range(s[0], 0xF1, 0xF3)
            && _mini_utf8_in_range(s[1], 0x80, 0xBF)
            && _mini_utf8_in_range(s[2], 0x80, 0xBF)
            && _mini_utf8_in_range(s[3], 0x80, 0xBF)) {
            /* [\xF1-\xF3][\x80-\xBF]{3}  # planes 4-15.
             * The lower bound 0xF1 keeps overlong 0xF0 sequences and
             * padded surrogates (0xED ...) from matching here.
             */
            s += 4;
        } else if (s[0] == 0xF4
            && _mini_utf8_in_range(s[1], 0x80, 0x8F)
            && _mini_utf8_in_range(s[2], 0x80, 0xBF)
            && _mini_utf8_in_range(s[3], 0x80, 0xBF)) {
            /* \xF4[\x80-\x8F][\x80-\xBF]{2}  # plane 16 */
            s += 4;
        } else {
            return -1;
        }
    }

    return pure_ascii;
}
|
|
|
|
/* bits start end bytes encoding
|
|
* 7 U+0000 U+007F 1 0xxxxxxx
|
|
* 11 U+0080 U+07FF 2 110xxxxx 10xxxxxx
|
|
* 16 U+0800 U+FFFF 3 1110xxxx 10xxxxxx 10xxxxxx
|
|
* 21 U+10000 U+1FFFFF 4 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
*
|
|
* validity checking derived from above patterns
|
|
*/
|
|
/* Decode the utf8 sequence at *str and step over it.
 *
 * str  address of a pointer into a NUL terminated string.
 *
 * Returns the decoded codepoint (> 0) and advances *str behind the
 * sequence. Returns 0 without touching *str if *str points at the
 * terminating 0 byte. Returns -1 without touching *str if the bytes at
 * *str are not a well-formed utf8 sequence (stray continuation byte,
 * overlong form, utf16 surrogate, value above U+10FFFF, truncated
 * sequence), or if str or *str is NULL.
 *
 * Every trailing byte is validated before the next one is read, and a 0
 * byte is never a valid continuation byte, so nothing behind the string
 * terminator is accessed.
 */
static inline int mini_utf8_decode(const char **str)
{
    const unsigned char *s;
    int cp;
    int len;

    if (!str || !*str) return -1;
    s = (const unsigned char*) *str;

    if (s[0] == 0) return 0;

    if (s[0] <= 0x7F) {
        /* ASCII */
        cp = s[0];
        len = 1;
    } else if (s[0] < 0xC2) {
        /* 0x80-0xBF: continuation byte, 0xC0/0xC1: overlong 2-byte lead */
        return -1;
    } else if (s[0] <= 0xDF) {
        if ((s[1] & 0xC0) != 0x80) return -1;
        cp = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
        len = 2;
    } else if (s[0] <= 0xEF) {
        if ((s[1] & 0xC0) != 0x80) return -1;
        if (s[0] == 0xE0 && s[1] < 0xA0) return -1; /* overlong */
        if (s[0] == 0xED && s[1] > 0x9F) return -1; /* utf16 surrogate */
        if ((s[2] & 0xC0) != 0x80) return -1;
        cp = ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
        len = 3;
    } else if (s[0] <= 0xF4) {
        if ((s[1] & 0xC0) != 0x80) return -1;
        if (s[0] == 0xF0 && s[1] < 0x90) return -1; /* overlong */
        if (s[0] == 0xF4 && s[1] > 0x8F) return -1; /* above U+10FFFF */
        if ((s[2] & 0xC0) != 0x80) return -1;
        if ((s[3] & 0xC0) != 0x80) return -1;
        /* a 4-byte lead is 11110xxx, so only the low 3 bits are payload */
        cp = ((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
        len = 4;
    } else {
        /* 0xF5-0xFF can never start a utf8 sequence */
        return -1;
    }

    *str += len;
    return cp;
}
|
|
|
|
/* only utf16 surrogate pairs (0xD800-0xDFFF) are invalid unicode
|
|
* codepoints
|
|
*/
|
|
/* Encode the codepoint cp as utf8 into str.
 *
 * cp   codepoint to encode, valid are 0..0x10FFFF without the utf16
 *      surrogate range 0xD800..0xDFFF. 0 is stored as a single 0 byte.
 * str  output buffer, no terminating 0 byte is appended.
 * len  number of bytes available in str.
 *
 * Returns the number of bytes written (1..4) on success, -1 if cp is not
 * a valid codepoint (negative values included), and 0 if str is NULL or
 * len is too small for the encoded sequence. Nothing is written unless
 * the whole sequence fits. The validity of cp is checked first, so an
 * invalid cp yields -1 regardless of the buffer.
 */
static inline int mini_utf8_encode(int cp, char *str, int len)
{
    unsigned char *s = (unsigned char*) str;
    int need;

    /* a negative cp must not fall into the 1-byte case below */
    if (cp < 0 || cp > 0x10FFFF) return -1;
    if (0xD800 <= cp && cp <= 0xDFFF) return -1;

    if (cp <= 0x7F)
        need = 1;
    else if (cp <= 0x7FF)
        need = 2;
    else if (cp <= 0xFFFF)
        need = 3;
    else
        need = 4;

    if (!s || len < need) return 0;

    switch (need) {
    case 1:
        s[0] = (unsigned char) cp;
        break;
    case 2:
        s[0] = (unsigned char) ((cp >> 6) | 0xC0);
        s[1] = (unsigned char) ((cp & 0x3F) | 0x80);
        break;
    case 3:
        s[0] = (unsigned char) ((cp >> 12) | 0xE0);
        s[1] = (unsigned char) (((cp >> 6) & 0x3F) | 0x80);
        s[2] = (unsigned char) ((cp & 0x3F) | 0x80);
        break;
    default:
        s[0] = (unsigned char) ((cp >> 18) | 0xF0);
        s[1] = (unsigned char) (((cp >> 12) & 0x3F) | 0x80);
        s[2] = (unsigned char) (((cp >> 6) & 0x3F) | 0x80);
        s[3] = (unsigned char) ((cp & 0x3F) | 0x80);
        break;
    }

    return need;
}
|
|
|
|
/* Count the utf8 codepoints in the NUL terminated string str.
 *
 * Walks the string with mini_utf8_decode() until that returns something
 * that is not a codepoint: 0 means the terminating 0 byte was reached,
 * anything below 0 means an invalid sequence was hit.
 *
 * Returns the number of codepoints, or -1 if the string contains an
 * invalid utf8 sequence or str is NULL.
 */
static inline int mini_utf8_strlen(const char *str)
{
    const char *cur = str;
    int count = 0;
    int cp;

    if (!str) return -1;

    while ((cp = mini_utf8_decode(&cur)) > 0)
        ++count;

    return (cp == 0) ? count : -1;
}
|
|
|
|
/* Byte offset of codepoint number cpno (counting from 0) in str.
 *
 * Steps over cpno codepoints using mini_utf8_decode() and reports how
 * many bytes that took. cpno equal to the number of codepoints in the
 * string is allowed and yields the offset of the terminating 0 byte.
 *
 * Returns the offset (>= 0), or -1 if the string has fewer than cpno
 * codepoints, contains an invalid utf8 sequence before that position,
 * cpno is negative, or str is NULL.
 */
static inline int mini_utf8_byteoffset(const char *str, int cpno)
{
    const char *cur = str;
    int i;

    /* a negative index must not silently map to offset 0 */
    if (!str || cpno < 0) return -1;

    for (i = 0; i < cpno; ++i) {
        /* 0: ran into the end of the string, < 0: invalid sequence */
        if (mini_utf8_decode(&cur) <= 0)
            return -1;
    }

    return (int)(cur - str);
}
|
|
|
|
#endif /* _mini_utf8 */
|