1 | /* |
2 | regex 1.0 alpha |
3 | |
4 | Copyright (c) 2019-2023 Dario Deledda. All rights reserved. |
5 | Use of this source code is governed by an MIT license |
6 | that can be found in the LICENSE file. |
7 | */ |
8 | module regex |
9 | |
10 | import strings |
11 | |
12 | /****************************************************************************** |
13 | * |
14 | * Inits |
15 | * |
16 | ******************************************************************************/ |
17 | // regex_base returns a regex object (`RE`) generated from `pattern` string and |
18 | // detailed information in re_err, err_pos, if an error occurred. |
19 | pub fn regex_base(pattern string) (RE, int, int) { |
20 | // init regex |
21 | mut re := RE{} |
22 | re.prog = []Token{len: pattern.len + 1} // max program length, can not be longer then the pattern |
23 | re.cc = []CharClass{len: pattern.len} // can not be more char class the the length of the pattern |
24 | re.group_csave_flag = false // enable continuos group saving |
25 | re.group_max_nested = pattern.len >> 1 // set max 128 group nested |
26 | re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth |
27 | |
28 | re.group_stack = []int{len: re.group_max, init: -1} |
29 | re.group_data = []int{len: re.group_max, init: -1} |
30 | |
31 | re_err, err_pos := re.impl_compile(pattern) |
32 | return re, re_err, err_pos |
33 | } |
34 | |
35 | /****************************************************************************** |
36 | * |
37 | * Utilities |
38 | * |
39 | ******************************************************************************/ |
40 | // get_group_bounds_by_name get a group boundaries by its name |
41 | pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) { |
42 | if group_name in re.group_map { |
43 | tmp_index := re.group_map[group_name] - 1 |
44 | start := re.groups[tmp_index * 2] |
45 | end := re.groups[tmp_index * 2 + 1] |
46 | return start, end |
47 | } |
48 | return -1, -1 |
49 | } |
50 | |
51 | // get_group_by_name get a group boundaries by its name |
52 | pub fn (re RE) get_group_by_name(in_txt string, group_name string) string { |
53 | if group_name in re.group_map { |
54 | tmp_index := re.group_map[group_name] - 1 |
55 | start := re.groups[tmp_index * 2] |
56 | end := re.groups[tmp_index * 2 + 1] |
57 | if start >= 0 && end > start { |
58 | return in_txt[start..end] |
59 | } |
60 | } |
61 | return '' |
62 | } |
63 | |
64 | // get_group_by_id get a group string by its id |
65 | pub fn (re RE) get_group_by_id(in_txt string, group_id int) string { |
66 | if group_id < (re.groups.len >> 1) { |
67 | index := group_id * 2 |
68 | start := re.groups[index] |
69 | end := re.groups[index + 1] |
70 | if start >= 0 && end > start { |
71 | return in_txt[start..end] |
72 | } |
73 | } |
74 | return '' |
75 | } |
76 | |
77 | // get_group_by_id get a group boundaries by its id |
78 | pub fn (re RE) get_group_bounds_by_id(group_id int) (int, int) { |
79 | if group_id < re.group_count { |
80 | index := group_id * 2 |
81 | return re.groups[index], re.groups[index + 1] |
82 | } |
83 | return -1, -1 |
84 | } |
85 | |
86 | pub struct Re_group { |
87 | pub: |
88 | start int = -1 |
89 | end int = -1 |
90 | } |
91 | |
92 | // get_group_list return a list of Re_group for the found groups |
93 | pub fn (re RE) get_group_list() []Re_group { |
94 | mut res := []Re_group{len: re.groups.len >> 1} |
95 | mut gi := 0 |
96 | // println("len: ${re.groups.len} groups: ${re.groups}") |
97 | |
98 | for gi < re.groups.len { |
99 | if re.groups[gi] >= 0 { |
100 | txt_st := re.groups[gi] |
101 | txt_en := re.groups[gi + 1] |
102 | |
103 | // println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ") |
104 | if txt_st >= 0 && txt_en > txt_st { |
105 | tmp := Re_group{ |
106 | start: re.groups[gi] |
107 | end: re.groups[gi + 1] |
108 | } |
109 | // println(tmp) |
110 | res[gi >> 1] = tmp |
111 | } else { |
112 | res[gi >> 1] = Re_group{} |
113 | } |
114 | } |
115 | gi += 2 |
116 | } |
117 | return res |
118 | } |
119 | |
120 | /****************************************************************************** |
121 | * |
122 | * Matchers |
123 | * |
124 | ******************************************************************************/ |
125 | // match_string Match the pattern with the in_txt string |
126 | [direct_array_access] |
127 | pub fn (re &RE) match_string(in_txt string) (int, int) { |
128 | unsafe { |
129 | start, mut end := re.match_base(in_txt.str, in_txt.len + 1) |
130 | if end > in_txt.len { |
131 | end = in_txt.len |
132 | } |
133 | |
134 | if start >= 0 && end > start { |
135 | if (re.flag & f_ms) != 0 && start > 0 { |
136 | return no_match_found, 0 |
137 | } |
138 | if (re.flag & f_me) != 0 && end < in_txt.len { |
139 | if in_txt[end] in new_line_list { |
140 | return start, end |
141 | } |
142 | return no_match_found, 0 |
143 | } |
144 | return start, end |
145 | } |
146 | return start, end |
147 | } |
148 | } |
149 | |
150 | // matches_string Checks if the pattern matches the in_txt string |
151 | pub fn (re &RE) matches_string(in_txt string) bool { |
152 | start, _ := re.match_string(in_txt) |
153 | return start != no_match_found |
154 | } |
155 | |
156 | /****************************************************************************** |
157 | * |
158 | * Finders |
159 | * |
160 | ******************************************************************************/ |
161 | /* |
162 | // find internal implementation HERE for reference do not remove!! |
163 | [direct_array_access] |
164 | fn (mut re RE) find_imp(in_txt string) (int,int) { |
165 | old_flag := re.flag |
166 | re.flag |= f_src // enable search mode |
167 | |
168 | start, mut end := re.match_base(in_txt.str, in_txt.len + 1) |
169 | //print("Find [$start,$end] '${in_txt[start..end]}'") |
170 | if end > in_txt.len { |
171 | end = in_txt.len |
172 | } |
173 | re.flag = old_flag |
174 | |
175 | if start >= 0 && end > start { |
176 | return start, end |
177 | } |
178 | return no_match_found, 0 |
179 | } |
180 | */ |
181 | |
182 | // find try to find the first match in the input string |
183 | [direct_array_access] |
184 | pub fn (mut re RE) find(in_txt string) (int, int) { |
185 | // old_flag := re.flag |
186 | // re.flag |= f_src // enable search mode |
187 | |
188 | mut i := 0 |
189 | for i < in_txt.len { |
190 | mut s := -1 |
191 | mut e := -1 |
192 | unsafe { |
193 | // tmp_str := tos(in_txt.str + i, in_txt.len - i) |
194 | // println("Check: [$tmp_str]") |
195 | s, e = re.match_base(in_txt.str + i, in_txt.len - i + 1) |
196 | |
197 | if s >= 0 && e > s { |
198 | // println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]") |
199 | // re.flag = old_flag |
200 | mut gi := 0 |
201 | for gi < re.groups.len { |
202 | re.groups[gi] += i |
203 | gi++ |
204 | } |
205 | // when ^ (f_ms) is used, it must match on beginning of string |
206 | if (re.flag & f_ms) != 0 && s > 0 { |
207 | break |
208 | } |
209 | // when $ (f_me) is used, it must match on ending of string |
210 | if (re.flag & f_me) != 0 && i + e < in_txt.len { |
211 | break |
212 | } |
213 | return i + s, i + e |
214 | } |
215 | i++ |
216 | } |
217 | } |
218 | // re.flag = old_flag |
219 | return -1, -1 |
220 | } |
221 | |
222 | // find try to find the first match in the input string strarting from start index |
223 | [direct_array_access] |
224 | pub fn (mut re RE) find_from(in_txt string, start int) (int, int) { |
225 | old_flag := re.flag |
226 | // re.flag |= f_src // enable search mode |
227 | |
228 | mut i := start |
229 | if i < 0 { |
230 | return -1, -1 |
231 | } |
232 | for i < in_txt.len { |
233 | //--- speed references --- |
234 | |
235 | mut s := -1 |
236 | mut e := -1 |
237 | |
238 | unsafe { |
239 | tmp_str := tos(in_txt.str + i, in_txt.len - i) |
240 | s, e = re.match_string(tmp_str) |
241 | } |
242 | //------------------------ |
243 | // s,e = re.find_imp(in_txt[i..]) |
244 | //------------------------ |
245 | if s >= 0 && e > s { |
246 | // println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]") |
247 | re.flag = old_flag |
248 | mut gi := 0 |
249 | for gi < re.groups.len { |
250 | re.groups[gi] += i |
251 | gi++ |
252 | } |
253 | return i + s, i + e |
254 | } else { |
255 | i++ |
256 | } |
257 | } |
258 | re.flag = old_flag |
259 | return -1, -1 |
260 | } |
261 | |
262 | // find_all find all the non overlapping occurrences of the match pattern and return the start and end index of the match |
263 | // |
264 | // Usage: |
265 | // ```v |
266 | // blurb := 'foobar boo steelbar toolbox foot tooooot' |
267 | // mut re := regex.regex_opt('f|t[eo]+')? |
268 | // res := re.find_all(blurb) // [0, 3, 12, 15, 20, 23, 28, 31, 33, 39] |
269 | // ``` |
270 | [direct_array_access] |
271 | pub fn (mut re RE) find_all(in_txt string) []int { |
272 | // old_flag := re.flag |
273 | // re.flag |= f_src // enable search mode |
274 | |
275 | mut i := 0 |
276 | mut res := []int{} |
277 | |
278 | for i < in_txt.len { |
279 | mut s := -1 |
280 | mut e := -1 |
281 | unsafe { |
282 | // tmp_str := in_txt[i..] |
283 | // tmp_str := tos(in_txt.str + i, in_txt.len - i) |
284 | // println("Check: [$tmp_str]") |
285 | s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i) |
286 | |
287 | if s >= 0 && e > s { |
288 | res << i + s |
289 | res << i + e |
290 | i += e |
291 | continue |
292 | } |
293 | /* |
294 | if e > 0 { |
295 | i += e |
296 | continue |
297 | } |
298 | */ |
299 | i++ |
300 | } |
301 | } |
302 | // re.flag = old_flag |
303 | return res |
304 | } |
305 | |
306 | // split returns the sections of string around the regex |
307 | // |
308 | // Usage: |
309 | // ```v |
310 | // blurb := 'foobar boo steelbar toolbox foot tooooot' |
311 | // mut re := regex.regex_opt('f|t[eo]+')? |
312 | // res := re.split(blurb) // ['bar boo s', 'lbar ', 'lbox ', 't ', 't'] |
313 | // ``` |
314 | pub fn (mut re RE) split(in_txt string) []string { |
315 | pos := re.find_all(in_txt) |
316 | |
317 | mut sections := []string{cap: pos.len / 2 + 1} |
318 | |
319 | if pos.len == 0 { |
320 | return [in_txt] |
321 | } |
322 | for i := 0; i < pos.len; i += 2 { |
323 | if pos[i] == 0 { |
324 | continue |
325 | } |
326 | if i == 0 { |
327 | sections << in_txt[..pos[i]] |
328 | } else { |
329 | sections << in_txt[pos[i - 1]..pos[i]] |
330 | } |
331 | } |
332 | if pos[pos.len - 1] != in_txt.len { |
333 | sections << in_txt[pos[pos.len - 1]..] |
334 | } |
335 | return sections |
336 | } |
337 | |
338 | // find_all_str find all the non overlapping occurrences of the match pattern, return a string list |
339 | [direct_array_access] |
340 | pub fn (mut re RE) find_all_str(in_txt string) []string { |
341 | // old_flag := re.flag |
342 | // re.flag |= f_src // enable search mode |
343 | |
344 | mut i := 0 |
345 | mut res := []string{} |
346 | |
347 | for i < in_txt.len { |
348 | mut s := -1 |
349 | mut e := -1 |
350 | unsafe { |
351 | // tmp_str := in_txt[i..] |
352 | // tmp_str := tos(in_txt.str + i, in_txt.len - i) |
353 | // println("Check: [$tmp_str]") |
354 | s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i) |
355 | |
356 | if s >= 0 && e > s { |
357 | tmp_str := tos(in_txt.str + i, in_txt.len - i) |
358 | mut tmp_e := if e > tmp_str.len { tmp_str.len } else { e } |
359 | // println("Found: $s:$e [${tmp_str[s..e]}]") |
360 | res << tmp_str[..tmp_e] |
361 | i += e |
362 | continue |
363 | } |
364 | } |
365 | /* |
366 | if e > 0 { |
367 | i += e |
368 | continue |
369 | } |
370 | */ |
371 | i++ |
372 | } |
373 | // re.flag = old_flag |
374 | return res |
375 | } |
376 | |
377 | /****************************************************************************** |
378 | * |
379 | * Replacers |
380 | * |
381 | ******************************************************************************/ |
382 | // replace_simple return a string where the matches are replaced with the replace string |
383 | pub fn (mut re RE) replace_simple(in_txt string, repl string) string { |
384 | pos := re.find_all(in_txt) |
385 | |
386 | if pos.len > 0 { |
387 | mut res := '' |
388 | mut i := 0 |
389 | |
390 | mut s1 := 0 |
391 | mut e1 := in_txt.len |
392 | |
393 | for i < pos.len { |
394 | e1 = pos[i] |
395 | res += in_txt[s1..e1] + repl |
396 | s1 = pos[i + 1] |
397 | i += 2 |
398 | } |
399 | |
400 | res += in_txt[s1..] |
401 | return res |
402 | } |
403 | return in_txt |
404 | } |
405 | |
406 | // type of function used for custom replace |
407 | // in_txt source text |
408 | // start index of the start of the match in in_txt |
409 | // end index of the end of the match in in_txt |
410 | // the match is in in_txt[start..end] |
411 | pub type FnReplace = fn (re RE, in_txt string, start int, end int) string |
412 | |
413 | // replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function |
414 | pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string { |
415 | mut i := 0 |
416 | mut res := strings.new_builder(in_txt.len) |
417 | mut last_end := 0 |
418 | |
419 | for i < in_txt.len { |
420 | // println("Find Start. $i [${in_txt[i..]}]") |
421 | s, e := re.find_from(in_txt, i) |
422 | // println("Find End.") |
423 | if s >= 0 && e > s { |
424 | // println("find match in: ${s},${e} [${in_txt[s..e]}]") |
425 | |
426 | if last_end < s { |
427 | res.write_string(in_txt[last_end..s]) |
428 | } |
429 | /* |
430 | for g_i in 0 .. re.group_count { |
431 | re.groups[g_i * 2] += i |
432 | re.groups[(g_i * 2) + 1] += i |
433 | } |
434 | */ |
435 | repl := repl_fn(re, in_txt, s, e) |
436 | // println("repl res: $repl") |
437 | res.write_string(repl) |
438 | // res.write_string("[[${in_txt[s..e]}]]") |
439 | |
440 | last_end = e |
441 | i = e |
442 | } else { |
443 | break |
444 | // i++ |
445 | } |
446 | // println(i) |
447 | } |
448 | if last_end >= 0 && last_end < in_txt.len { |
449 | res.write_string(in_txt[last_end..]) |
450 | } |
451 | return res.str() |
452 | } |
453 | |
454 | fn (re RE) parsed_replace_string(in_txt string, repl string) string { |
455 | str_lst := repl.split('\\') |
456 | mut res := str_lst[0] |
457 | mut i := 1 |
458 | for i < str_lst.len { |
459 | tmp := str_lst[i] |
460 | // println("tmp: ${tmp}") |
461 | if tmp.len > 0 && tmp[0] >= `0` && tmp[0] <= `9` { |
462 | group_id := int(tmp[0] - `0`) |
463 | group := re.get_group_by_id(in_txt, group_id) |
464 | // println("group: $group_id [$group]") |
465 | res += '${group}${tmp[1..]}' |
466 | } else { |
467 | res += '\\' + tmp |
468 | } |
469 | i++ |
470 | } |
471 | return res |
472 | } |
473 | |
474 | // replace return a string where the matches are replaced with the repl_str string, |
475 | // this function support use groups in the replace string |
476 | pub fn (mut re RE) replace(in_txt string, repl_str string) string { |
477 | mut i := 0 |
478 | mut res := strings.new_builder(in_txt.len) |
479 | mut last_end := 0 |
480 | |
481 | for i < in_txt.len { |
482 | // println("Find Start. $i [${in_txt[i..]}]") |
483 | s, e := re.find_from(in_txt, i) |
484 | // println("Find End.") |
485 | if s >= 0 && e > s { |
486 | // println("find match in: ${s},${e} [${in_txt[s..e]}]") |
487 | |
488 | if last_end < s { |
489 | res.write_string(in_txt[last_end..s]) |
490 | } |
491 | /* |
492 | for g_i in 0 .. re.group_count { |
493 | re.groups[g_i * 2] += i |
494 | re.groups[(g_i * 2) + 1] += i |
495 | } |
496 | */ |
497 | // repl := repl_fn(re, in_txt, s, e) |
498 | repl := re.parsed_replace_string(in_txt, repl_str) |
499 | // println("repl res: $repl") |
500 | res.write_string(repl) |
501 | // res.write_string("[[${in_txt[s..e]}]]") |
502 | |
503 | last_end = e |
504 | i = e |
505 | } else { |
506 | break |
507 | // i++ |
508 | } |
509 | // println(i) |
510 | } |
511 | if last_end >= 0 && last_end < in_txt.len { |
512 | res.write_string(in_txt[last_end..]) |
513 | } |
514 | return res.str() |
515 | } |
516 | |
517 | // replace_n return a string where the firts count matches are replaced with the repl_str string, |
518 | // if count is > 0 the replace began from the start of the string toward the end |
519 | // if count is < 0 the replace began from the end of the string toward the start |
520 | // if count is 0 do nothing |
521 | pub fn (mut re RE) replace_n(in_txt string, repl_str string, count int) string { |
522 | mut i := 0 |
523 | mut index := 0 |
524 | mut i_p := 0 |
525 | mut res := strings.new_builder(in_txt.len) |
526 | mut lst := re.find_all(in_txt) |
527 | |
528 | if count < 0 { // start from the right of the string |
529 | lst = unsafe { lst#[count * 2..] } // limitate the number of substitions |
530 | } else if count > 0 { // start from the left of the string |
531 | lst = unsafe { lst#[..count * 2] } // limitate the number of substitions |
532 | } else if count == 0 { // no replace |
533 | return in_txt |
534 | } |
535 | |
536 | // println("found: ${lst}") |
537 | for index < lst.len { |
538 | i = lst[index] |
539 | res.write_string(in_txt[i_p..i]) |
540 | res.write_string(repl_str) |
541 | index++ |
542 | i_p = lst[index] |
543 | index++ |
544 | } |
545 | i = i_p |
546 | res.write_string(in_txt[i..]) |
547 | |
548 | return res.str() |
549 | } |