1 | // Copyright (c) 2019-2023 Alexander Medvednikov. All rights reserved. |
2 | // Use of this source code is governed by an MIT license |
3 | // that can be found in the LICENSE file. |
4 | module builtin |
5 | |
// utf8_char_len returns the number of bytes (1 to 4) of the UTF-8 sequence
// that is introduced by the lead byte `b`.
// The constant 0xe5000000 is a packed table of sixteen 2-bit entries holding
// `length - 1`; the entry is selected by the upper four bits of `b`.
pub fn utf8_char_len(b u8) int {
	// (b >> 3) & 0x1e == 2 * (b >> 4), i.e. the bit offset of the table entry
	table_shift := (b >> 3) & 0x1e
	return ((0xe5000000 >> table_shift) & 3) + 1
}
9 | |
// utf32_to_str converts the UTF-32 value `code` (a Unicode codepoint) to a UTF-8 string.
// The bytes are stored in a freshly allocated 5 byte buffer.
// When the conversion yields an empty string, that buffer is freed again.
pub fn utf32_to_str(code u32) string {
	unsafe {
		mut storage := malloc_noscan(5)
		encoded := utf32_to_str_no_malloc(code, storage)
		if encoded.len != 0 {
			return encoded
		}
		// the allocation was not used at all, so release it
		free(storage)
		return encoded
	}
}
23 | |
// utf32_to_str_no_malloc encodes `code` as UTF-8 into the caller supplied `buf`,
// appends a terminating 0 byte, and returns the string that `tos` builds from `buf`.
// `buf` needs room for the encoded bytes plus that terminating 0.
// An empty string is returned when no bytes were written to `buf`.
[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, buf &u8) string {
	unsafe {
		written := utf32_decode_to_buffer(code, buf)
		if written != 0 {
			buf[written] = 0
			return tos(buf, written)
		}
		return ''
	}
}
35 | |
// utf32_decode_to_buffer writes the UTF-8 encoding of the codepoint `code`
// into `buf` and returns the number of bytes written (1 to 4).
// It returns 0 and leaves `buf` untouched, when `code` is above 0x10FFFF.
// `buf` must have room for up to 4 bytes; no terminating 0 is written here.
[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, buf &u8) int {
	unsafe {
		// The comparisons are done on the unsigned value. Converting `code` to a
		// signed int first would make every value >= 0x80000000 negative, so it
		// would take the single byte branch instead of being rejected.
		mut buffer := &u8(buf)
		if code <= 0x7F {
			buffer[0] = u8(code)
			return 1
		} else if code <= 0x7FF {
			buffer[0] = 0xC0 | u8(code >> 6) // 110xxxxx
			buffer[1] = 0x80 | u8(code & 0x3F) // 10xxxxxx
			return 2
		} else if code <= 0xFFFF {
			buffer[0] = 0xE0 | u8(code >> 12) // 1110xxxx
			buffer[1] = 0x80 | (u8(code >> 6) & 0x3F) // 10xxxxxx
			buffer[2] = 0x80 | u8(code & 0x3F) // 10xxxxxx
			return 3
		} else if code <= 0x10FFFF {
			buffer[0] = 0xF0 | u8(code >> 18) // 11110xxx
			buffer[1] = 0x80 | (u8(code >> 12) & 0x3F) // 10xxxxxx
			buffer[2] = 0x80 | (u8(code >> 6) & 0x3F) // 10xxxxxx
			buffer[3] = 0x80 | u8(code & 0x3F) // 10xxxxxx
			return 4
		}
	}
	// not encodable: nothing was written
	return 0
}
68 | |
// utf32_code converts the UTF-8 bytes of the string to a single UTF-32 value.
// It returns 0 when `utf8_to_utf32` reports an error for those bytes.
//
// The original implementation did not check for valid UTF-8 in the string,
// and could produce values greater than the UTF-32 spec allows.
// It has been replaced by `utf8_to_utf32`, which has a result return type.
//
// This function is left for backward compatibility.
// It is used in vlib/builtin/string.v, and also in vlib/v/gen/c/cgen.v
pub fn (_rune string) utf32_code() int {
	codepoint := _rune.bytes().utf8_to_utf32() or {
		// error('more than one utf-8 rune found in this string')
		rune(0)
	}
	return int(codepoint)
}
85 | |
// utf8_to_utf32 converts an array of UTF-8 bytes to a single UTF-32 value.
// An empty array gives 0, and a single byte is returned unchanged.
// It returns an error when more than 4 bytes are submitted.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
	n := _bytes.len
	if n > 4 {
		return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
	}
	if n == 0 {
		return 0
	}
	if n == 1 {
		// return ASCII unchanged
		return rune(_bytes[0])
	}

	// Shifting the first byte left by the number of bytes, within 8 bits,
	// pushes its leading length marker bits out of the value.
	lead := u8(_bytes[0] << n)
	mut res := rune(lead)
	// The first step only shifts by what is still missing up to 6 bits,
	// every following step makes room for a full 6 bit payload.
	mut shift := 6 - n
	for i in 1 .. n {
		payload := rune(_bytes[i]) & 63 // 0x3f
		res = rune(res) << shift
		res |= payload
		shift = 6
	}
	return res
}
113 | |
// utf8_str_visible_length calculates the string length for formatting,
// i.e. the number of "characters" (columns) that `s` occupies.
// Every UTF-8 sequence counts as 1, combining marks in the ranges below
// count as 0, and wide characters in the ranges below count as 2.
// Counting stops at an incomplete UTF-8 sequence at the end of `s`.
// This is a simplified implementation. If you need specification compliant width,
// use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
	mut l := 0
	mut ul := 1
	for i := 0; i < s.len; i += ul {
		c := unsafe { s.str[i] }
		// byte length of the sequence, taken from the lead byte
		// (same packed table lookup as in utf8_char_len)
		ul = ((0xe5000000 >> ((c >> 3) & 0x1e)) & 3) + 1
		if i + ul > s.len { // incomplete UTF-8 sequence
			return l
		}
		l++
		// avoid the match if not needed
		if ul == 1 {
			continue
		}
		// recognize combining characters and wide characters;
		// `r` holds the raw bytes of the sequence, not the decoded codepoint
		match ul {
			2 {
				r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
				if r >= 0xcc80 && r < 0xcdb0 {
					// diacritical marks take no column
					l--
				}
			}
			3 {
				r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
				// diacritical marks extended
				// diacritical marks supplement
				// diacritical marks for symbols
				// half marks
				if (r >= 0xe1aab0 && r <= 0xe1ac7f)
					|| (r >= 0xe1b780 && r <= 0xe1b87f)
					|| (r >= 0xe28390 && r <= 0xe2847f)
					|| (r >= 0xefb8a0 && r <= 0xefb8af) {
					// combining marks take no column
					l--
				}
				// Hangul
				// CJK Unified Ideographs
				// Hangul
				// CJK
				else if (r >= 0xe18480 && r <= 0xe1859f)
					|| (r >= 0xe2ba80 && r <= 0xe2bf95)
					|| (r >= 0xe38080 && r <= 0xe4b77f)
					|| (r >= 0xe4b880 && r <= 0xea807f)
					|| (r >= 0xeaa5a0 && r <= 0xeaa79f)
					|| (r >= 0xeab080 && r <= 0xed9eaf)
					|| (r >= 0xefa480 && r <= 0xefac7f)
					|| (r >= 0xefb8b8 && r <= 0xefb9af) {
					// wide characters take two columns
					l++
				}
			}
			4 {
				r := u64((u32(c) << 24) | unsafe {
					(u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
				})
				// Enclosed Ideographic Supplement
				// Emoji
				// CJK Unified Ideographs Extension B-G
				// The first lower bound is 0xf09f8880 (the bytes of U+1F200).
				// A bound of 0x0f9f8880 lies below every 4 byte sequence, which
				// would count all sequences up to 0xf09f8a8f as wide.
				if (r >= 0xf09f8880 && r <= 0xf09f8a8f)
					|| (r >= 0xf09f8c80 && r <= 0xf09f9c90)
					|| (r >= 0xf09fa490 && r <= 0xf09fa7af)
					|| (r >= 0xf0a08080 && r <= 0xf180807f) {
					// wide characters take two columns
					l++
				}
			}
			else {}
		}
	}
	return l
}