v / vlib / regex
Raw file | 549 loc (493 sloc) | 13.13 KB | Latest commit hash 42a9eaac0
1/*
2regex 1.0 alpha
3
4Copyright (c) 2019-2023 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7*/
8module regex
9
10import strings
11
12/******************************************************************************
13*
14* Inits
15*
16******************************************************************************/
17// regex_base returns a regex object (`RE`) generated from `pattern` string and
18// detailed information in re_err, err_pos, if an error occurred.
19pub fn regex_base(pattern string) (RE, int, int) {
20 // init regex
21 mut re := RE{}
22 re.prog = []Token{len: pattern.len + 1} // max program length, can not be longer then the pattern
23 re.cc = []CharClass{len: pattern.len} // can not be more char class the the length of the pattern
24 re.group_csave_flag = false // enable continuos group saving
25 re.group_max_nested = pattern.len >> 1 // set max 128 group nested
26 re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern legth
27
28 re.group_stack = []int{len: re.group_max, init: -1}
29 re.group_data = []int{len: re.group_max, init: -1}
30
31 re_err, err_pos := re.impl_compile(pattern)
32 return re, re_err, err_pos
33}
34
35/******************************************************************************
36*
37* Utilities
38*
39******************************************************************************/
40// get_group_bounds_by_name get a group boundaries by its name
41pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) {
42 if group_name in re.group_map {
43 tmp_index := re.group_map[group_name] - 1
44 start := re.groups[tmp_index * 2]
45 end := re.groups[tmp_index * 2 + 1]
46 return start, end
47 }
48 return -1, -1
49}
50
51// get_group_by_name get a group boundaries by its name
52pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
53 if group_name in re.group_map {
54 tmp_index := re.group_map[group_name] - 1
55 start := re.groups[tmp_index * 2]
56 end := re.groups[tmp_index * 2 + 1]
57 if start >= 0 && end > start {
58 return in_txt[start..end]
59 }
60 }
61 return ''
62}
63
64// get_group_by_id get a group string by its id
65pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
66 if group_id < (re.groups.len >> 1) {
67 index := group_id * 2
68 start := re.groups[index]
69 end := re.groups[index + 1]
70 if start >= 0 && end > start {
71 return in_txt[start..end]
72 }
73 }
74 return ''
75}
76
77// get_group_by_id get a group boundaries by its id
78pub fn (re RE) get_group_bounds_by_id(group_id int) (int, int) {
79 if group_id < re.group_count {
80 index := group_id * 2
81 return re.groups[index], re.groups[index + 1]
82 }
83 return -1, -1
84}
85
86pub struct Re_group {
87pub:
88 start int = -1
89 end int = -1
90}
91
92// get_group_list return a list of Re_group for the found groups
93pub fn (re RE) get_group_list() []Re_group {
94 mut res := []Re_group{len: re.groups.len >> 1}
95 mut gi := 0
96 // println("len: ${re.groups.len} groups: ${re.groups}")
97
98 for gi < re.groups.len {
99 if re.groups[gi] >= 0 {
100 txt_st := re.groups[gi]
101 txt_en := re.groups[gi + 1]
102
103 // println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ")
104 if txt_st >= 0 && txt_en > txt_st {
105 tmp := Re_group{
106 start: re.groups[gi]
107 end: re.groups[gi + 1]
108 }
109 // println(tmp)
110 res[gi >> 1] = tmp
111 } else {
112 res[gi >> 1] = Re_group{}
113 }
114 }
115 gi += 2
116 }
117 return res
118}
119
120/******************************************************************************
121*
122* Matchers
123*
124******************************************************************************/
125// match_string Match the pattern with the in_txt string
126[direct_array_access]
127pub fn (re &RE) match_string(in_txt string) (int, int) {
128 unsafe {
129 start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
130 if end > in_txt.len {
131 end = in_txt.len
132 }
133
134 if start >= 0 && end > start {
135 if (re.flag & f_ms) != 0 && start > 0 {
136 return no_match_found, 0
137 }
138 if (re.flag & f_me) != 0 && end < in_txt.len {
139 if in_txt[end] in new_line_list {
140 return start, end
141 }
142 return no_match_found, 0
143 }
144 return start, end
145 }
146 return start, end
147 }
148}
149
150// matches_string Checks if the pattern matches the in_txt string
151pub fn (re &RE) matches_string(in_txt string) bool {
152 start, _ := re.match_string(in_txt)
153 return start != no_match_found
154}
155
156/******************************************************************************
157*
158* Finders
159*
160******************************************************************************/
161/*
162// find internal implementation HERE for reference do not remove!!
163[direct_array_access]
164fn (mut re RE) find_imp(in_txt string) (int,int) {
165 old_flag := re.flag
166 re.flag |= f_src // enable search mode
167
168 start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
169 //print("Find [$start,$end] '${in_txt[start..end]}'")
170 if end > in_txt.len {
171 end = in_txt.len
172 }
173 re.flag = old_flag
174
175 if start >= 0 && end > start {
176 return start, end
177 }
178 return no_match_found, 0
179}
180*/
181
182// find try to find the first match in the input string
183[direct_array_access]
184pub fn (mut re RE) find(in_txt string) (int, int) {
185 // old_flag := re.flag
186 // re.flag |= f_src // enable search mode
187
188 mut i := 0
189 for i < in_txt.len {
190 mut s := -1
191 mut e := -1
192 unsafe {
193 // tmp_str := tos(in_txt.str + i, in_txt.len - i)
194 // println("Check: [$tmp_str]")
195 s, e = re.match_base(in_txt.str + i, in_txt.len - i + 1)
196
197 if s >= 0 && e > s {
198 // println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]")
199 // re.flag = old_flag
200 mut gi := 0
201 for gi < re.groups.len {
202 re.groups[gi] += i
203 gi++
204 }
205 // when ^ (f_ms) is used, it must match on beginning of string
206 if (re.flag & f_ms) != 0 && s > 0 {
207 break
208 }
209 // when $ (f_me) is used, it must match on ending of string
210 if (re.flag & f_me) != 0 && i + e < in_txt.len {
211 break
212 }
213 return i + s, i + e
214 }
215 i++
216 }
217 }
218 // re.flag = old_flag
219 return -1, -1
220}
221
222// find try to find the first match in the input string strarting from start index
223[direct_array_access]
224pub fn (mut re RE) find_from(in_txt string, start int) (int, int) {
225 old_flag := re.flag
226 // re.flag |= f_src // enable search mode
227
228 mut i := start
229 if i < 0 {
230 return -1, -1
231 }
232 for i < in_txt.len {
233 //--- speed references ---
234
235 mut s := -1
236 mut e := -1
237
238 unsafe {
239 tmp_str := tos(in_txt.str + i, in_txt.len - i)
240 s, e = re.match_string(tmp_str)
241 }
242 //------------------------
243 // s,e = re.find_imp(in_txt[i..])
244 //------------------------
245 if s >= 0 && e > s {
246 // println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}]")
247 re.flag = old_flag
248 mut gi := 0
249 for gi < re.groups.len {
250 re.groups[gi] += i
251 gi++
252 }
253 return i + s, i + e
254 } else {
255 i++
256 }
257 }
258 re.flag = old_flag
259 return -1, -1
260}
261
262// find_all find all the non overlapping occurrences of the match pattern and return the start and end index of the match
263//
264// Usage:
265// ```v
266// blurb := 'foobar boo steelbar toolbox foot tooooot'
267// mut re := regex.regex_opt('f|t[eo]+')?
268// res := re.find_all(blurb) // [0, 3, 12, 15, 20, 23, 28, 31, 33, 39]
269// ```
270[direct_array_access]
271pub fn (mut re RE) find_all(in_txt string) []int {
272 // old_flag := re.flag
273 // re.flag |= f_src // enable search mode
274
275 mut i := 0
276 mut res := []int{}
277
278 for i < in_txt.len {
279 mut s := -1
280 mut e := -1
281 unsafe {
282 // tmp_str := in_txt[i..]
283 // tmp_str := tos(in_txt.str + i, in_txt.len - i)
284 // println("Check: [$tmp_str]")
285 s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i)
286
287 if s >= 0 && e > s {
288 res << i + s
289 res << i + e
290 i += e
291 continue
292 }
293 /*
294 if e > 0 {
295 i += e
296 continue
297 }
298 */
299 i++
300 }
301 }
302 // re.flag = old_flag
303 return res
304}
305
306// split returns the sections of string around the regex
307//
308// Usage:
309// ```v
310// blurb := 'foobar boo steelbar toolbox foot tooooot'
311// mut re := regex.regex_opt('f|t[eo]+')?
312// res := re.split(blurb) // ['bar boo s', 'lbar ', 'lbox ', 't ', 't']
313// ```
314pub fn (mut re RE) split(in_txt string) []string {
315 pos := re.find_all(in_txt)
316
317 mut sections := []string{cap: pos.len / 2 + 1}
318
319 if pos.len == 0 {
320 return [in_txt]
321 }
322 for i := 0; i < pos.len; i += 2 {
323 if pos[i] == 0 {
324 continue
325 }
326 if i == 0 {
327 sections << in_txt[..pos[i]]
328 } else {
329 sections << in_txt[pos[i - 1]..pos[i]]
330 }
331 }
332 if pos[pos.len - 1] != in_txt.len {
333 sections << in_txt[pos[pos.len - 1]..]
334 }
335 return sections
336}
337
338// find_all_str find all the non overlapping occurrences of the match pattern, return a string list
339[direct_array_access]
340pub fn (mut re RE) find_all_str(in_txt string) []string {
341 // old_flag := re.flag
342 // re.flag |= f_src // enable search mode
343
344 mut i := 0
345 mut res := []string{}
346
347 for i < in_txt.len {
348 mut s := -1
349 mut e := -1
350 unsafe {
351 // tmp_str := in_txt[i..]
352 // tmp_str := tos(in_txt.str + i, in_txt.len - i)
353 // println("Check: [$tmp_str]")
354 s, e = re.match_base(in_txt.str + i, in_txt.len + 1 - i)
355
356 if s >= 0 && e > s {
357 tmp_str := tos(in_txt.str + i, in_txt.len - i)
358 mut tmp_e := if e > tmp_str.len { tmp_str.len } else { e }
359 // println("Found: $s:$e [${tmp_str[s..e]}]")
360 res << tmp_str[..tmp_e]
361 i += e
362 continue
363 }
364 }
365 /*
366 if e > 0 {
367 i += e
368 continue
369 }
370 */
371 i++
372 }
373 // re.flag = old_flag
374 return res
375}
376
377/******************************************************************************
378*
379* Replacers
380*
381******************************************************************************/
382// replace_simple return a string where the matches are replaced with the replace string
383pub fn (mut re RE) replace_simple(in_txt string, repl string) string {
384 pos := re.find_all(in_txt)
385
386 if pos.len > 0 {
387 mut res := ''
388 mut i := 0
389
390 mut s1 := 0
391 mut e1 := in_txt.len
392
393 for i < pos.len {
394 e1 = pos[i]
395 res += in_txt[s1..e1] + repl
396 s1 = pos[i + 1]
397 i += 2
398 }
399
400 res += in_txt[s1..]
401 return res
402 }
403 return in_txt
404}
405
406// type of function used for custom replace
407// in_txt source text
408// start index of the start of the match in in_txt
409// end index of the end of the match in in_txt
410// the match is in in_txt[start..end]
411pub type FnReplace = fn (re RE, in_txt string, start int, end int) string
412
413// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function
414pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
415 mut i := 0
416 mut res := strings.new_builder(in_txt.len)
417 mut last_end := 0
418
419 for i < in_txt.len {
420 // println("Find Start. $i [${in_txt[i..]}]")
421 s, e := re.find_from(in_txt, i)
422 // println("Find End.")
423 if s >= 0 && e > s {
424 // println("find match in: ${s},${e} [${in_txt[s..e]}]")
425
426 if last_end < s {
427 res.write_string(in_txt[last_end..s])
428 }
429 /*
430 for g_i in 0 .. re.group_count {
431 re.groups[g_i * 2] += i
432 re.groups[(g_i * 2) + 1] += i
433 }
434 */
435 repl := repl_fn(re, in_txt, s, e)
436 // println("repl res: $repl")
437 res.write_string(repl)
438 // res.write_string("[[${in_txt[s..e]}]]")
439
440 last_end = e
441 i = e
442 } else {
443 break
444 // i++
445 }
446 // println(i)
447 }
448 if last_end >= 0 && last_end < in_txt.len {
449 res.write_string(in_txt[last_end..])
450 }
451 return res.str()
452}
453
454fn (re RE) parsed_replace_string(in_txt string, repl string) string {
455 str_lst := repl.split('\\')
456 mut res := str_lst[0]
457 mut i := 1
458 for i < str_lst.len {
459 tmp := str_lst[i]
460 // println("tmp: ${tmp}")
461 if tmp.len > 0 && tmp[0] >= `0` && tmp[0] <= `9` {
462 group_id := int(tmp[0] - `0`)
463 group := re.get_group_by_id(in_txt, group_id)
464 // println("group: $group_id [$group]")
465 res += '${group}${tmp[1..]}'
466 } else {
467 res += '\\' + tmp
468 }
469 i++
470 }
471 return res
472}
473
474// replace return a string where the matches are replaced with the repl_str string,
475// this function support use groups in the replace string
476pub fn (mut re RE) replace(in_txt string, repl_str string) string {
477 mut i := 0
478 mut res := strings.new_builder(in_txt.len)
479 mut last_end := 0
480
481 for i < in_txt.len {
482 // println("Find Start. $i [${in_txt[i..]}]")
483 s, e := re.find_from(in_txt, i)
484 // println("Find End.")
485 if s >= 0 && e > s {
486 // println("find match in: ${s},${e} [${in_txt[s..e]}]")
487
488 if last_end < s {
489 res.write_string(in_txt[last_end..s])
490 }
491 /*
492 for g_i in 0 .. re.group_count {
493 re.groups[g_i * 2] += i
494 re.groups[(g_i * 2) + 1] += i
495 }
496 */
497 // repl := repl_fn(re, in_txt, s, e)
498 repl := re.parsed_replace_string(in_txt, repl_str)
499 // println("repl res: $repl")
500 res.write_string(repl)
501 // res.write_string("[[${in_txt[s..e]}]]")
502
503 last_end = e
504 i = e
505 } else {
506 break
507 // i++
508 }
509 // println(i)
510 }
511 if last_end >= 0 && last_end < in_txt.len {
512 res.write_string(in_txt[last_end..])
513 }
514 return res.str()
515}
516
517// replace_n return a string where the firts count matches are replaced with the repl_str string,
518// if count is > 0 the replace began from the start of the string toward the end
519// if count is < 0 the replace began from the end of the string toward the start
520// if count is 0 do nothing
521pub fn (mut re RE) replace_n(in_txt string, repl_str string, count int) string {
522 mut i := 0
523 mut index := 0
524 mut i_p := 0
525 mut res := strings.new_builder(in_txt.len)
526 mut lst := re.find_all(in_txt)
527
528 if count < 0 { // start from the right of the string
529 lst = unsafe { lst#[count * 2..] } // limitate the number of substitions
530 } else if count > 0 { // start from the left of the string
531 lst = unsafe { lst#[..count * 2] } // limitate the number of substitions
532 } else if count == 0 { // no replace
533 return in_txt
534 }
535
536 // println("found: ${lst}")
537 for index < lst.len {
538 i = lst[index]
539 res.write_string(in_txt[i_p..i])
540 res.write_string(repl_str)
541 index++
542 i_p = lst[index]
543 index++
544 }
545 i = i_p
546 res.write_string(in_txt[i..])
547
548 return res.str()
549}