Gitly

v / vlib / builtin

Raw file | 186 loc (177 sloc) | 5.12 KB | Latest commit hash 90941b3b1


1 // Copyright (c) 2019-2023 Alexander Medvednikov. All rights reserved.
2 // Use of this source code is governed by an MIT license
3 // that can be found in the LICENSE file.
4 module builtin
5 
// utf8_char_len returns the byte length (1 to 4) of the UTF-8 sequence
// whose first byte is `b`. Bytes in the range 0x80-0xBF yield 1.
pub fn utf8_char_len(b u8) int {
    // 0xe5000000 is a table of 2-bit entries indexed by the high nibble of `b`;
    // `(b >> 3) & 0x1e` is that nibble multiplied by 2, i.e. the entry's bit offset.
    bit_offset := (b >> 3) & 0x1e
    extra_bytes := (u32(0xe5000000) >> bit_offset) & 3
    return int(extra_bytes) + 1
}
9 
// utf32_to_str converts the utf32 value `code` (a Unicode code point)
// to a newly allocated utf8 string.
// The result is '' when `utf32_to_str_no_malloc` produces no bytes for `code`.
pub fn utf32_to_str(code u32) string {
    unsafe {
        // 5 bytes: up to 4 encoded bytes plus the terminating 0
        mut storage := malloc_noscan(5)
        encoded := utf32_to_str_no_malloc(code, storage)
        if encoded.len != 0 {
            return encoded
        }
        // the storage was not used at all, so release it before returning
        free(storage)
        return encoded
    }
}
23 
// utf32_to_str_no_malloc encodes `code` as utf8 into the caller supplied `buf`,
// writes a 0 byte right after the encoded bytes, and returns a string built
// with `tos` over `buf` (no copy is made here).
// `buf` needs room for 5 bytes: at most 4 encoded bytes plus the terminator.
// Returns '' when `utf32_decode_to_buffer` reports that nothing was written.
[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, buf &u8) string {
    unsafe {
        written := utf32_decode_to_buffer(code, buf)
        if written != 0 {
            buf[written] = 0
            return tos(buf, written)
        }
        return ''
    }
}
35 
// utf32_decode_to_buffer writes the utf8 encoding of the code point `code`
// into `buf` and returns the number of bytes written (1 to 4).
// It returns 0 and writes nothing when `code` is above 0x10FFFF.
// `buf` must have room for at least 4 bytes; no terminating 0 is written.
// Surrogate values (0xD800-0xDFFF) are not rejected; they are encoded as 3 bytes.
[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, buf &u8) int {
    unsafe {
        mut buffer := &u8(buf)
        // Compare the unsigned value directly: casting to `int` first would turn
        // values above 0x7FFFFFFF negative and make them pass the ASCII check.
        if code <= 0x7f {
            buffer[0] = u8(code)
            return 1
        } else if code <= 0x7ff {
            buffer[0] = 0xc0 | u8(code >> 6) // 110xxxxx
            buffer[1] = 0x80 | u8(code & 0x3f) // 10xxxxxx
            return 2
        } else if code <= 0xffff {
            buffer[0] = 0xe0 | u8(code >> 12) // 1110xxxx
            buffer[1] = 0x80 | u8((code >> 6) & 0x3f) // 10xxxxxx
            buffer[2] = 0x80 | u8(code & 0x3f) // 10xxxxxx
            return 3
        } else if code <= 0x10ffff {
            buffer[0] = 0xf0 | u8(code >> 18) // 11110xxx
            buffer[1] = 0x80 | u8((code >> 12) & 0x3f) // 10xxxxxx
            buffer[2] = 0x80 | u8((code >> 6) & 0x3f) // 10xxxxxx
            buffer[3] = 0x80 | u8(code & 0x3f) // 10xxxxxx
            return 4
        }
    }
    return 0
}
68 
// utf32_code converts the utf8 bytes of the string `_rune` to a utf32 value.
// The original implementation did not check for valid utf8 in the string,
// and could result in values greater than the utf32 spec.
// It has been replaced by `utf8_to_utf32`, which has a result return type.
//
// This function is left for backward compatibility; it is used in
// vlib/builtin/string.v, and also in vlib/v/gen/c/cgen.v.
// When `utf8_to_utf32` returns an error, 0 is returned instead.
pub fn (_rune string) utf32_code() int {
    raw := _rune.bytes()
    code_point := raw.utf8_to_utf32() or {
        // error('more than one utf-8 rune found in this string')
        rune(0)
    }
    return int(code_point)
}
85 
// utf8_to_utf32 converts an array of utf8 bytes to a single utf32 value.
// An empty array yields 0 and a single byte is returned unchanged.
// It returns an error if more than 4 bytes are submitted.
// The marker bits of the bytes are stripped, not validated.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
    n := _bytes.len
    if n == 0 {
        return 0
    }
    if n == 1 {
        // a lone byte is passed through as-is
        return rune(_bytes[0])
    }
    if n > 4 {
        return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
    }

    // Shifting the lead byte left by `n` inside 8 bits pushes its length
    // marker bits out; the payload bits end up `n` positions too high.
    lead := u8(_bytes[0] << n)
    mut res := rune(lead)
    // the first shift is `6 - n` to compensate for the `n` positions above,
    // every following continuation byte contributes 6 bits
    mut shift := 6 - n
    for i in 1 .. n {
        res = (res << shift) | (rune(_bytes[i]) & 63) // 0x3f
        shift = 6
    }
    return res
}
113 
// utf8_str_visible_length calculates the length of `s` for formatting,
// i.e. the number of "characters".
// Every UTF-8 sequence counts as 1, except that the combining marks listed
// below count as 0 and the wide characters listed below count as 2.
// Counting stops at an incomplete trailing UTF-8 sequence.
// This is a simplified implementation. If you need specification compliant width,
// use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
    mut l := 0
    mut ul := 1
    for i := 0; i < s.len; i += ul {
        c := unsafe { s.str[i] }
        // byte length of the sequence, looked up from the high nibble of its first byte
        ul = ((0xe5000000 >> ((c >> 3) & 0x1e)) & 3) + 1
        if i + ul > s.len { // incomplete UTF-8 sequence
            return l
        }
        l++
        // avoid the match if not needed
        if ul == 1 {
            continue
        }
        // recognize combining characters and wide characters;
        // `r` holds the raw bytes of the sequence, first byte most significant
        match ul {
            2 {
                r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
                if r >= 0xcc80 && r < 0xcdb0 {
                    // diacritical marks
                    l--
                }
            }
            3 {
                r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
                // diacritical marks extended
                // diacritical marks supplement
                // diacritical marks for symbols
                // half marks
                if (r >= 0xe1aab0 && r <= 0xe1ac7f)
                    || (r >= 0xe1b780 && r <= 0xe1b87f)
                    || (r >= 0xe28390 && r <= 0xe2847f)
                    || (r >= 0xefb8a0 && r <= 0xefb8af) {
                    // combining mark: it does not add to the length
                    l--
                }
                // Hangul
                // CJK Unified Ideographs
                // Hangul
                // CJK
                else if (r >= 0xe18480 && r <= 0xe1859f)
                    || (r >= 0xe2ba80 && r <= 0xe2bf95)
                    || (r >= 0xe38080 && r <= 0xe4b77f)
                    || (r >= 0xe4b880 && r <= 0xea807f)
                    || (r >= 0xeaa5a0 && r <= 0xeaa79f)
                    || (r >= 0xeab080 && r <= 0xed9eaf)
                    || (r >= 0xefa480 && r <= 0xefac7f)
                    || (r >= 0xefb8b8 && r <= 0xefb9af) {
                    // wide character: counts twice
                    l++
                }
            }
            4 {
                r := u64((u32(c) << 24) | unsafe {
                    (u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
                })
                // Enclosed Ideographic Supplement
                // Emoji
                // CJK Unified Ideographs Extension B-G
                // The first lower bound is 0xf09f8880 (U+1F200); a lower bound of
                // 0x0f9f8880 would match every 4-byte sequence up to 0xf09f8a8f.
                if (r >= 0xf09f8880 && r <= 0xf09f8a8f)
                    || (r >= 0xf09f8c80 && r <= 0xf09f9c90)
                    || (r >= 0xf09fa490 && r <= 0xf09fa7af)
                    || (r >= 0xf0a08080 && r <= 0xf180807f) {
                    // wide character: counts twice
                    l++
                }
            }
            else {}
        }
    }
    return l
}

1	// Copyright (c) 2019-2023 Alexander Medvednikov. All rights reserved.
2	// Use of this source code is governed by an MIT license
3	// that can be found in the LICENSE file.
4	module builtin
5
// utf8_char_len returns the byte length (1 to 4) of the UTF-8 sequence
// whose first byte is `b`. Bytes in the range 0x80-0xBF yield 1.
pub fn utf8_char_len(b u8) int {
    // 0xe5000000 is a table of 2-bit entries indexed by the high nibble of `b`;
    // `(b >> 3) & 0x1e` is that nibble multiplied by 2, i.e. the entry's bit offset.
    bit_offset := (b >> 3) & 0x1e
    extra_bytes := (u32(0xe5000000) >> bit_offset) & 3
    return int(extra_bytes) + 1
}
9
// utf32_to_str converts the utf32 value `code` (a Unicode code point)
// to a newly allocated utf8 string.
// The result is '' when `utf32_to_str_no_malloc` produces no bytes for `code`.
pub fn utf32_to_str(code u32) string {
    unsafe {
        // 5 bytes: up to 4 encoded bytes plus the terminating 0
        mut storage := malloc_noscan(5)
        encoded := utf32_to_str_no_malloc(code, storage)
        if encoded.len != 0 {
            return encoded
        }
        // the storage was not used at all, so release it before returning
        free(storage)
        return encoded
    }
}
23
// utf32_to_str_no_malloc encodes `code` as utf8 into the caller supplied `buf`,
// writes a 0 byte right after the encoded bytes, and returns a string built
// with `tos` over `buf` (no copy is made here).
// `buf` needs room for 5 bytes: at most 4 encoded bytes plus the terminator.
// Returns '' when `utf32_decode_to_buffer` reports that nothing was written.
[manualfree; unsafe]
pub fn utf32_to_str_no_malloc(code u32, buf &u8) string {
    unsafe {
        written := utf32_decode_to_buffer(code, buf)
        if written != 0 {
            buf[written] = 0
            return tos(buf, written)
        }
        return ''
    }
}
35
// utf32_decode_to_buffer writes the utf8 encoding of the code point `code`
// into `buf` and returns the number of bytes written (1 to 4).
// It returns 0 and writes nothing when `code` is above 0x10FFFF.
// `buf` must have room for at least 4 bytes; no terminating 0 is written.
// Surrogate values (0xD800-0xDFFF) are not rejected; they are encoded as 3 bytes.
[manualfree; unsafe]
pub fn utf32_decode_to_buffer(code u32, buf &u8) int {
    unsafe {
        mut buffer := &u8(buf)
        // Compare the unsigned value directly: casting to `int` first would turn
        // values above 0x7FFFFFFF negative and make them pass the ASCII check.
        if code <= 0x7f {
            buffer[0] = u8(code)
            return 1
        } else if code <= 0x7ff {
            buffer[0] = 0xc0 | u8(code >> 6) // 110xxxxx
            buffer[1] = 0x80 | u8(code & 0x3f) // 10xxxxxx
            return 2
        } else if code <= 0xffff {
            buffer[0] = 0xe0 | u8(code >> 12) // 1110xxxx
            buffer[1] = 0x80 | u8((code >> 6) & 0x3f) // 10xxxxxx
            buffer[2] = 0x80 | u8(code & 0x3f) // 10xxxxxx
            return 3
        } else if code <= 0x10ffff {
            buffer[0] = 0xf0 | u8(code >> 18) // 11110xxx
            buffer[1] = 0x80 | u8((code >> 12) & 0x3f) // 10xxxxxx
            buffer[2] = 0x80 | u8((code >> 6) & 0x3f) // 10xxxxxx
            buffer[3] = 0x80 | u8(code & 0x3f) // 10xxxxxx
            return 4
        }
    }
    return 0
}
68
// utf32_code converts the utf8 bytes of the string `_rune` to a utf32 value.
// The original implementation did not check for valid utf8 in the string,
// and could result in values greater than the utf32 spec.
// It has been replaced by `utf8_to_utf32`, which has a result return type.
//
// This function is left for backward compatibility; it is used in
// vlib/builtin/string.v, and also in vlib/v/gen/c/cgen.v.
// When `utf8_to_utf32` returns an error, 0 is returned instead.
pub fn (_rune string) utf32_code() int {
    raw := _rune.bytes()
    code_point := raw.utf8_to_utf32() or {
        // error('more than one utf-8 rune found in this string')
        rune(0)
    }
    return int(code_point)
}
85
// utf8_to_utf32 converts an array of utf8 bytes to a single utf32 value.
// An empty array yields 0 and a single byte is returned unchanged.
// It returns an error if more than 4 bytes are submitted.
// The marker bits of the bytes are stripped, not validated.
pub fn (_bytes []u8) utf8_to_utf32() !rune {
    n := _bytes.len
    if n == 0 {
        return 0
    }
    if n == 1 {
        // a lone byte is passed through as-is
        return rune(_bytes[0])
    }
    if n > 4 {
        return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
    }

    // Shifting the lead byte left by `n` inside 8 bits pushes its length
    // marker bits out; the payload bits end up `n` positions too high.
    lead := u8(_bytes[0] << n)
    mut res := rune(lead)
    // the first shift is `6 - n` to compensate for the `n` positions above,
    // every following continuation byte contributes 6 bits
    mut shift := 6 - n
    for i in 1 .. n {
        res = (res << shift) | (rune(_bytes[i]) & 63) // 0x3f
        shift = 6
    }
    return res
}
113
// utf8_str_visible_length calculates the length of `s` for formatting,
// i.e. the number of "characters".
// Every UTF-8 sequence counts as 1, except that the combining marks listed
// below count as 0 and the wide characters listed below count as 2.
// Counting stops at an incomplete trailing UTF-8 sequence.
// This is a simplified implementation. If you need specification compliant width,
// use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
    mut l := 0
    mut ul := 1
    for i := 0; i < s.len; i += ul {
        c := unsafe { s.str[i] }
        // byte length of the sequence, looked up from the high nibble of its first byte
        ul = ((0xe5000000 >> ((c >> 3) & 0x1e)) & 3) + 1
        if i + ul > s.len { // incomplete UTF-8 sequence
            return l
        }
        l++
        // avoid the match if not needed
        if ul == 1 {
            continue
        }
        // recognize combining characters and wide characters;
        // `r` holds the raw bytes of the sequence, first byte most significant
        match ul {
            2 {
                r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
                if r >= 0xcc80 && r < 0xcdb0 {
                    // diacritical marks
                    l--
                }
            }
            3 {
                r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
                // diacritical marks extended
                // diacritical marks supplement
                // diacritical marks for symbols
                // half marks
                if (r >= 0xe1aab0 && r <= 0xe1ac7f)
                    || (r >= 0xe1b780 && r <= 0xe1b87f)
                    || (r >= 0xe28390 && r <= 0xe2847f)
                    || (r >= 0xefb8a0 && r <= 0xefb8af) {
                    // combining mark: it does not add to the length
                    l--
                }
                // Hangul
                // CJK Unified Ideographs
                // Hangul
                // CJK
                else if (r >= 0xe18480 && r <= 0xe1859f)
                    || (r >= 0xe2ba80 && r <= 0xe2bf95)
                    || (r >= 0xe38080 && r <= 0xe4b77f)
                    || (r >= 0xe4b880 && r <= 0xea807f)
                    || (r >= 0xeaa5a0 && r <= 0xeaa79f)
                    || (r >= 0xeab080 && r <= 0xed9eaf)
                    || (r >= 0xefa480 && r <= 0xefac7f)
                    || (r >= 0xefb8b8 && r <= 0xefb9af) {
                    // wide character: counts twice
                    l++
                }
            }
            4 {
                r := u64((u32(c) << 24) | unsafe {
                    (u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
                })
                // Enclosed Ideographic Supplement
                // Emoji
                // CJK Unified Ideographs Extension B-G
                // The first lower bound is 0xf09f8880 (U+1F200); a lower bound of
                // 0x0f9f8880 would match every 4-byte sequence up to 0xf09f8a8f.
                if (r >= 0xf09f8880 && r <= 0xf09f8a8f)
                    || (r >= 0xf09f8c80 && r <= 0xf09f9c90)
                    || (r >= 0xf09fa490 && r <= 0xf09fa7af)
                    || (r >= 0xf0a08080 && r <= 0xf180807f) {
                    // wide character: counts twice
                    l++
                }
            }
            else {}
        }
    }
    return l
}