forked from aburch/simutrans
-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathunicode.h
83 lines (65 loc) · 2.51 KB
/
unicode.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/*
* This file is part of the Simutrans-Extended project under the Artistic License.
* (see LICENSE.txt)
*/
#ifndef UNICODE_H
#define UNICODE_H
#include <stddef.h>
#include "simtypes.h"
// Code point constant, declared here and defined in another translation unit.
// The decoder documentation in this header refers to it as the value emitted
// when the end of a C string is reached.
extern utf32 const UNICODE_NUL;
/**
 * Tests whether a byte is a UTF-8 continuation byte.
 * Continuation bytes always have the bit pattern 10xxxxxx, i.e. the two
 * most significant bits are "10".
 * @param c the byte to test.
 * @return non-zero if c is a continuation byte, zero otherwise.
 */
inline int is_cont_char(utf8 c)
{
	// isolate the two most significant bits and compare against binary 10
	const int top_two_bits = c & 0xC0;
	return top_two_bits == 0x80;
}
/**
 * UTF-8 string decoder that can be used to iterate through all code points.
 *
 * An instance wraps a UTF-8 C string and is driven with has_next() / next().
 * The static decode() overloads work on a caller-supplied buffer and need no
 * decoder instance.
 */
class utf8_decoder_t
{
private:
// Pointer to the UTF-8 formatted C string being decoded.
utf8 const *utf8str;
public:
/**
 * Constructs a UTF-8 decoder for the given C string.
 * @param str the UTF-8 C string to iterate over.
 */
utf8_decoder_t(utf8 const *str);
/**
 * Decodes a Unicode code point from the byte sequence pointed to by buff.
 * On return buff has been advanced to point at the beginning of the next Unicode code point.
 * Does not respect the NUL terminator character; care should be taken to detect the
 * emitted UNICODE_NUL when decoding C strings to avoid buffer overrun errors.
 * Invalid Unicode sequences are interpreted using ISO-8859-1 and advance buff by 1 byte.
 * @param buff in/out: pointer to the sequence to decode, advanced past it on return.
 * @return the decoded code point.
 */
static utf32 decode(utf8 const *&buff);
/**
 * Decodes a Unicode code point from the byte sequence pointed to by buff.
 * On return len contains the length of the Unicode character in bytes.
 * Does not respect the NUL terminator character; care should be taken to detect the
 * emitted UNICODE_NUL when decoding C strings to avoid buffer overrun errors.
 * Invalid Unicode sequences are interpreted using ISO-8859-1 with a len of 1.
 * @param buff pointer to the sequence to decode; not modified by this overload.
 * @param len out: number of bytes occupied by the decoded character.
 * @return the decoded code point.
 */
static utf32 decode(utf8 const *const buff, size_t &len);
/**
 * Returns true if there are more code points left to decode.
 * Returns false if at end of string.
 */
bool has_next() const;
/**
 * Returns the next Unicode code point value in the string.
 * Returns UNICODE_NUL if has_next returns false.
 */
utf32 next();
/**
 * Returns the current position of the decoder.
 * This is a pointer to the next character.
 * NOTE(review): not declared const although the description suggests a pure
 * query - confirm against the implementation before relying on that.
 */
utf8 const *get_position();
};
// Unicode-safe moving in strings.
// Overloads are provided for both utf8 and plain char strings.
// NOTE(review): presumably pos is a byte offset into text and the return value
// is the byte offset of the following (next) or preceding (prev) character;
// the implementation is not part of this header - confirm there.
// Note that the "next" variants use size_t while the "prev" variants use sint32.
size_t utf8_get_next_char(const utf8 *text, size_t pos);
size_t utf8_get_next_char(const char* text, size_t pos);
sint32 utf8_get_prev_char(const utf8 *text, sint32 pos);
sint32 utf8_get_prev_char(const char* text, sint32 pos);
// NOTE(review): presumably encodes the UTF-16 value 'unicode' as UTF-8 into
// 'out' and returns the number of bytes written; the required capacity of
// 'out' is not visible in this header - confirm against the implementation.
int utf16_to_utf8(utf16 unicode, utf8 *out);
// Conversion between Unicode and latin2 character values.
// unicode_to_latin2 returns the latin2 value, or 0 on error.
// NOTE(review): latin2_to_unicode is presumably the reverse mapping; its
// behaviour for unmapped input is not documented here - confirm.
uint8 unicode_to_latin2( utf16 chr );
utf16 latin2_to_unicode( uint8 chr );
// Caseless (case-insensitive) strstr for UTF-8 strings.
// Overloads are provided for both utf8 and plain char strings.
// NOTE(review): by analogy with strstr this presumably returns a pointer to
// the first match inside the haystack, or NULL if there is none - confirm
// against the implementation.
const utf8 *utf8caseutf8( const utf8 *haystack_start, const utf8 *needle_start );
const char *utf8caseutf8( const char *haystack, const char *needle );
#endif