v / vlib / regex
Raw file | 2686 loc (2356 sloc) | 67.57 KB | Latest commit hash da68b2d36
1/*
2regex 1.0 alpha
3
4Copyright (c) 2019-2023 Dario Deledda. All rights reserved.
5Use of this source code is governed by an MIT license
6that can be found in the LICENSE file.
7
8This file contains regex module
9
10Know limitation:
11- find is implemented in a trivial way
12- not full compliant PCRE
13- not compliant POSIX ERE
14*/
15module regex
16
17import strings
18
19pub const (
20 v_regex_version = '1.0 alpha' // regex module version
21
22 max_code_len = 256 // default small base code len for the regex programs
23 max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
24 // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
25 spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
26 // new line chars for now only '\n'
27 new_line_list = [`\n`, `\r`]
28
29 // Results
30 no_match_found = -1
31
32 // Errors
33 compile_ok = 0 // the regex string compiled, all ok
34 err_char_unknown = -2 // the char used is unknow to the system
35 err_undefined = -3 // the compiler symbol is undefined
36 err_internal_error = -4 // Bug in the regex system!!
37 err_cc_alloc_overflow = -5 // memory for char class full!!
38 err_syntax_error = -6 // syntax error in regex compiling
39 err_groups_overflow = -7 // max number of groups reached
40 err_groups_max_nested = -8 // max number of nested group reached
41 err_group_not_balanced = -9 // group not balanced
42 err_group_qm_notation = -10 // group invalid notation
43 err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
44 err_neg_group_quantifier = -12 // negation groups can not have quantifier
45 err_consecutive_dots = -13 // two consecutive dots is an error
46)
47
48const (
49 //*************************************
50 // regex program instructions
51 //*************************************
52 ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
53 // char class 11 0100 AA xxxxxxxx
54 // AA = 00 regular class
55 // AA = 01 Negated class ^ char
56 ist_char_class = u32(0xD1000000) // MASK
57 ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
58 ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]
59 // dot char 10 0110 xx xxxxxxxx
60 ist_dot_char = u32(0x98000000) // match any char except \n
61 // backslash chars 10 0100 xx xxxxxxxx
62 ist_bsls_char = u32(0x90000000) // backslash char
63 // OR | 10 010Y xx xxxxxxxx
64 ist_or_branch = u32(0x91000000) // OR case
65 // groups 10 010Y xx xxxxxxxx
66 ist_group_start = u32(0x92000000) // group start (
67 ist_group_end = u32(0x94000000) // group end )
68 // control instructions
69 ist_prog_end = u32(0x88000000) // 10 0010 xx xxxxxxxx
70 //*************************************
71)
72
73/*
74General Utilities
75*/
76// utf8util_char_len calculate the length in bytes of a utf8 char
77[inline]
78fn utf8util_char_len(b u8) int {
79 return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1
80}
81
82// get_char get a char from position i and return an u32 with the unicode code
83[direct_array_access; inline]
84fn (re RE) get_char(in_txt string, i int) (u32, int) {
85 ini := unsafe { in_txt.str[i] }
86 // ascii 8 bit
87 if (re.flag & regex.f_bin) != 0 || ini & 0x80 == 0 {
88 return u32(ini), 1
89 }
90 // unicode char
91 char_len := utf8util_char_len(ini)
92 mut tmp := 0
93 mut ch := u32(0)
94 for tmp < char_len {
95 ch = (ch << 8) | unsafe { in_txt.str[i + tmp] }
96 tmp++
97 }
98 return ch, char_len
99}
100
101// get_charb get a char from position i and return an u32 with the unicode code
102[direct_array_access; inline]
103fn (re RE) get_charb(in_txt &u8, i int) (u32, int) {
104 // ascii 8 bit
105 if (re.flag & regex.f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 {
106 return u32(unsafe { in_txt[i] }), 1
107 }
108 // unicode char
109 char_len := utf8util_char_len(unsafe { in_txt[i] })
110 mut tmp := 0
111 mut ch := u32(0)
112 for tmp < char_len {
113 ch = (ch << 8) | unsafe { in_txt[i + tmp] }
114 tmp++
115 }
116 return ch, char_len
117}
118
119[inline]
120fn is_alnum(in_char u8) bool {
121 mut tmp := in_char - `A`
122 if tmp <= 25 {
123 return true
124 }
125 tmp = in_char - `a`
126 if tmp <= 25 {
127 return true
128 }
129 tmp = in_char - `0`
130 if tmp <= 9 {
131 return true
132 }
133 if in_char == `_` {
134 return true
135 }
136 return false
137}
138
139[inline]
140fn is_not_alnum(in_char u8) bool {
141 return !is_alnum(in_char)
142}
143
144[inline]
145fn is_space(in_char u8) bool {
146 return in_char in regex.spaces
147}
148
149[inline]
150fn is_not_space(in_char u8) bool {
151 return !is_space(in_char)
152}
153
154[inline]
155fn is_digit(in_char u8) bool {
156 tmp := in_char - `0`
157 return tmp <= 0x09
158}
159
160[inline]
161fn is_not_digit(in_char u8) bool {
162 return !is_digit(in_char)
163}
164
165/*
166[inline]
167fn is_wordchar(in_char byte) bool {
168 return is_alnum(in_char) || in_char == `_`
169}
170
171[inline]
172fn is_not_wordchar(in_char byte) bool {
173 return !is_alnum(in_char)
174}
175*/
176
177[inline]
178fn is_lower(in_char u8) bool {
179 tmp := in_char - `a`
180 return tmp <= 25
181}
182
183[inline]
184fn is_upper(in_char u8) bool {
185 tmp := in_char - `A`
186 return tmp <= 25
187}
188
189pub fn (re RE) get_parse_error_string(err int) string {
190 match err {
191 regex.compile_ok { return 'compile_ok' }
192 regex.no_match_found { return 'no_match_found' }
193 regex.err_char_unknown { return 'err_char_unknown' }
194 regex.err_undefined { return 'err_undefined' }
195 regex.err_internal_error { return 'err_internal_error' }
196 regex.err_cc_alloc_overflow { return 'err_cc_alloc_overflow' }
197 regex.err_syntax_error { return 'err_syntax_error' }
198 regex.err_groups_overflow { return 'err_groups_overflow' }
199 regex.err_groups_max_nested { return 'err_groups_max_nested' }
200 regex.err_group_not_balanced { return 'err_group_not_balanced' }
201 regex.err_group_qm_notation { return 'err_group_qm_notation' }
202 regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
203 regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
204 regex.err_consecutive_dots { return 'err_consecutive_dots' }
205 else { return 'err_unknown' }
206 }
207}
208
209// utf8_str convert and utf8 sequence to a printable string
210[inline]
211fn utf8_str(ch rune) string {
212 mut i := 4
213 mut res := ''
214 for i > 0 {
215 v := u8((ch >> ((i - 1) * 8)) & 0xFF)
216 if v != 0 {
217 res += '${v:1c}'
218 }
219 i--
220 }
221 return res
222}
223
224// simple_log default log function
225fn simple_log(txt string) {
226 print(txt)
227}
228
229/******************************************************************************
230*
231* Token Structs
232*
233******************************************************************************/
234pub type FnValidator = fn (u8) bool
235
236struct Token {
237mut:
238 ist rune
239 // char
240 ch rune // char of the token if any
241 ch_len u8 // char len
242 // Quantifiers / branch
243 rep_min int // used also for jump next in the OR branch [no match] pc jump
244 rep_max int // used also for jump next in the OR branch [ match] pc jump
245 greedy bool // greedy quantifier flag
246 // Char class
247 cc_index int = -1
248 // counters for quantifier check (repetitions)
249 rep int
250 // validator function pointer
251 validator FnValidator = unsafe { nil }
252 // groups variables
253 group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion
254 group_rep int // repetition of the group
255 group_id int = -1 // id of the group
256 goto_pc int = -1 // jump to this PC if is needed
257 // OR flag for the token
258 next_is_or bool // true if the next token is an OR
259 // dot_char token variables
260 dot_check_pc int = -1 // pc of the next token to check for dots
261 bsls_check_pc int = -1 // pc of the next token to check for bsls
262 cc_check_pc int = -1 // pc of the next token to check for CC
263 last_dot_flag bool // if true indicate that is the last dot_char in the regex
264 // debug fields
265 source_index int
266}
267
268[inline]
269fn (mut tok Token) reset() {
270 tok.rep = 0
271}
272
273/******************************************************************************
274*
275* Regex struct
276*
277******************************************************************************/
278pub const (
279 f_nl = 0x00000001 // end the match when find a new line symbol
280 f_ms = 0x00000002 // match true only if the match is at the start of the string
281 f_me = 0x00000004 // match true only if the match is at the end of the string
282
283 f_efm = 0x00000100 // exit on first token matched, used by search
284 f_bin = 0x00000200 // work only on bytes, ignore utf-8
285 // behaviour modifier flags
286 f_src = 0x00020000 // search mode enabled
287)
288
289// Log function prototype
290pub type FnLog = fn (string)
291
292pub struct RE {
293pub mut:
294 prog []Token
295 prog_len int // regex program len
296 // char classes storage
297 cc []CharClass // char class list
298 cc_index int // index
299 // groups
300 group_count int // number of groups in this regex struct
301 groups []int // groups index results
302 group_max_nested int = 3 // max nested group
303 group_max int = 8 // max allowed number of different groups
304
305 state_list []StateObj
306
307 group_csave_flag bool // flag to enable continuous saving
308 group_csave []int //= []int{} // groups continuous save list
309
310 group_map map[string]int // groups names map
311
312 group_stack []int
313 group_data []int
314 // flags
315 flag int // flag for optional parameters
316 // Debug/log
317 debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
318 log_func FnLog = simple_log // log function, can be customized by the user
319 query string // query string
320}
321
322// Reset RE object
323[direct_array_access; inline]
324pub fn (mut re RE) reset() {
325 re.cc_index = 0
326
327 mut i := 0
328 for i < re.prog_len {
329 re.prog[i].group_rep = 0 // clear repetition of the group
330 re.prog[i].rep = 0 // clear repetition of the token
331 i++
332 }
333
334 // init groups array
335 if re.group_count > 0 {
336 if re.groups.len == 0 {
337 // first run alloc memory
338 re.groups = []int{len: re.group_count * 2, init: -1}
339 } else {
340 // subsequent executions, only clean up the memory
341 i = 0
342 for i < re.groups.len {
343 re.groups[i] = -1
344 i++
345 }
346 }
347 }
348
349 // reset group_csave
350 if re.group_csave_flag == true {
351 re.group_csave.clear() // = []int{}
352 }
353
354 // reset state list
355 re.state_list.clear()
356 re.group_stack.clear()
357}
358
359// reset for search mode fail
360// gcc bug, dont use [inline] or go 5 time slower
361//[inline]
362[direct_array_access]
363fn (mut re RE) reset_src() {
364 mut i := 0
365 for i < re.prog_len {
366 re.prog[i].group_rep = 0 // clear repetition of the group
367 re.prog[i].rep = 0 // clear repetition of the token
368 i++
369 }
370}
371
372/******************************************************************************
373*
374* Backslashes chars
375*
376******************************************************************************/
377struct BslsStruct {
378 ch rune // meta char
379 validator FnValidator = unsafe { nil } // validator function pointer
380}
381
382const (
383 bsls_validator_array = [
384 BslsStruct{`w`, is_alnum},
385 BslsStruct{`W`, is_not_alnum},
386 BslsStruct{`s`, is_space},
387 BslsStruct{`S`, is_not_space},
388 BslsStruct{`d`, is_digit},
389 BslsStruct{`D`, is_not_digit},
390 BslsStruct{`a`, is_lower},
391 BslsStruct{`A`, is_upper},
392 ]
393
394 // these chars are escape if preceded by a \
395 bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`,
396 `^`, `!`]
397)
398
399enum BSLS_parse_state {
400 start
401 bsls_found
402 bsls_char
403 normal_char
404}
405
406// parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
407fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
408 mut status := BSLS_parse_state.start
409 mut i := in_i
410
411 for i < in_txt.len {
412 // get our char
413 char_tmp, char_len := re.get_char(in_txt, i)
414 ch := u8(char_tmp)
415
416 if status == .start && ch == `\\` {
417 status = .bsls_found
418 i += char_len
419 continue
420 }
421
422 // check if is our bsls char, for now only one length sequence
423 if status == .bsls_found {
424 for c, x in regex.bsls_validator_array {
425 if x.ch == ch {
426 return c, i - in_i + 1
427 }
428 }
429 status = .normal_char
430 continue
431 }
432
433 // no BSLS validator, manage as normal escape char char
434 if status == .normal_char {
435 if ch in regex.bsls_escape_list {
436 return regex.no_match_found, i - in_i + 1
437 }
438 return regex.err_syntax_error, i - in_i + 1
439 }
440
441 // at the present time we manage only one char after the \
442 break
443 }
444 // not our bsls return KO
445 return regex.err_syntax_error, i
446}
447
448/******************************************************************************
449*
450* Char class
451*
452******************************************************************************/
453const (
454 cc_null = 0 // empty cc token
455 cc_char = 1 // simple char: a
456 cc_int = 2 // char interval: a-z
457 cc_bsls = 3 // backslash char
458 cc_end = 4 // cc sequence terminator
459)
460
461struct CharClass {
462mut:
463 cc_type int // type of cc token
464 ch0 rune // first char of the interval a-b a in this case
465 ch1 rune // second char of the interval a-b b in this case
466 validator FnValidator = unsafe { nil } // validator function pointer
467}
468
469enum CharClass_parse_state {
470 start
471 in_char
472 in_bsls
473 separator
474 finish
475}
476
477fn (re RE) get_char_class(pc int) string {
478 buf := []u8{len: (re.cc.len)}
479 mut buf_ptr := unsafe { &u8(&buf) }
480
481 mut cc_i := re.prog[pc].cc_index
482 mut i := 0
483 mut tmp := 0
484 for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
485 if re.cc[cc_i].cc_type == regex.cc_bsls {
486 unsafe {
487 buf_ptr[i] = `\\`
488 i++
489 buf_ptr[i] = u8(re.cc[cc_i].ch0)
490 i++
491 }
492 } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
493 tmp = 3
494 for tmp >= 0 {
495 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
496 if x != 0 {
497 unsafe {
498 buf_ptr[i] = x
499 i++
500 }
501 }
502 tmp--
503 }
504 } else {
505 tmp = 3
506 for tmp >= 0 {
507 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
508 if x != 0 {
509 unsafe {
510 buf_ptr[i] = x
511 i++
512 }
513 }
514 tmp--
515 }
516 unsafe {
517 buf_ptr[i] = `-`
518 i++
519 }
520 tmp = 3
521 for tmp >= 0 {
522 x := u8((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF)
523 if x != 0 {
524 unsafe {
525 buf_ptr[i] = x
526 i++
527 }
528 }
529 tmp--
530 }
531 }
532 cc_i++
533 }
534 unsafe {
535 buf_ptr[i] = u8(0)
536 }
537 return unsafe { tos_clone(buf_ptr) }
538}
539
540fn (re RE) check_char_class(pc int, ch rune) bool {
541 mut cc_i := re.prog[pc].cc_index
542 for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
543 if re.cc[cc_i].cc_type == regex.cc_bsls {
544 if re.cc[cc_i].validator(u8(ch)) {
545 return true
546 }
547 } else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 {
548 return true
549 }
550 cc_i++
551 }
552 return false
553}
554
555// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
556fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
557 mut status := CharClass_parse_state.start
558 mut i := in_i
559
560 mut tmp_index := re.cc_index
561 res_index := re.cc_index
562
563 mut cc_type := u32(regex.ist_char_class_pos)
564
565 for i < in_txt.len {
566 // check if we are out of memory for char classes
567 if tmp_index >= re.cc.len {
568 return regex.err_cc_alloc_overflow, 0, u32(0)
569 }
570
571 // get our char
572 char_tmp, char_len := re.get_char(in_txt, i)
573 ch := u8(char_tmp)
574
575 // println("CC #${i:3d} ch: ${ch:c}")
576
577 // negation
578 if status == .start && ch == `^` {
579 cc_type = u32(regex.ist_char_class_neg)
580 i += char_len
581 continue
582 }
583
584 // minus symbol
585 if status == .start && ch == `-` {
586 re.cc[tmp_index].cc_type = regex.cc_char
587 re.cc[tmp_index].ch0 = char_tmp
588 re.cc[tmp_index].ch1 = char_tmp
589 i += char_len
590 tmp_index++
591 continue
592 }
593
594 // bsls
595 if (status == .start || status == .in_char) && ch == `\\` {
596 // println("CC bsls.")
597 status = .in_bsls
598 i += char_len
599 continue
600 }
601
602 if status == .in_bsls {
603 // println("CC bsls validation.")
604 for c, x in regex.bsls_validator_array {
605 if x.ch == ch {
606 // println("CC bsls found [${ch:c}]")
607 re.cc[tmp_index].cc_type = regex.cc_bsls
608 re.cc[tmp_index].ch0 = regex.bsls_validator_array[c].ch
609 re.cc[tmp_index].ch1 = regex.bsls_validator_array[c].ch
610 re.cc[tmp_index].validator = regex.bsls_validator_array[c].validator
611 i += char_len
612 tmp_index++
613 status = .in_char
614 break
615 }
616 }
617 if status == .in_bsls {
618 // manage as a simple char
619 // println("CC bsls not found [${ch:c}]")
620 re.cc[tmp_index].cc_type = regex.cc_char
621 re.cc[tmp_index].ch0 = char_tmp
622 re.cc[tmp_index].ch1 = char_tmp
623 i += char_len
624 tmp_index++
625 status = .in_char
626 continue
627 } else {
628 continue
629 }
630 }
631
632 // simple char
633 if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
634 status = .in_char
635
636 re.cc[tmp_index].cc_type = regex.cc_char
637 re.cc[tmp_index].ch0 = char_tmp
638 re.cc[tmp_index].ch1 = char_tmp
639
640 i += char_len
641 tmp_index++
642 continue
643 }
644
645 // check range separator
646 if status == .in_char && ch == `-` {
647 status = .separator
648 i += char_len
649 continue
650 }
651
652 // check range end
653 if status == .separator && ch != `]` && ch != `-` {
654 status = .in_char
655 re.cc[tmp_index - 1].cc_type = regex.cc_int
656 re.cc[tmp_index - 1].ch1 = char_tmp
657 i += char_len
658 continue
659 }
660
661 // char class end
662 if status == .in_char && ch == `]` {
663 re.cc[tmp_index].cc_type = regex.cc_end
664 re.cc[tmp_index].ch0 = 0
665 re.cc[tmp_index].ch1 = 0
666 re.cc_index = tmp_index + 1
667
668 return res_index, i - in_i + 2, cc_type
669 }
670
671 i++
672 }
673 return regex.err_syntax_error, 0, u32(0)
674}
675
676/******************************************************************************
677*
678* Re Compiler
679*
680******************************************************************************/
681//
682// Quantifier
683//
684enum Quant_parse_state {
685 start
686 min_parse
687 comma_checked
688 max_parse
689 greedy
690 gredy_parse
691 finish
692}
693
694// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
695fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
696 mut status := Quant_parse_state.start
697 mut i := in_i
698
699 mut q_min := 0 // default min in a {} quantifier is 1
700 mut q_max := 0 // default max in a {} quantifier is max_quantifier
701
702 mut ch := u8(0)
703
704 for i < in_txt.len {
705 unsafe {
706 ch = in_txt.str[i]
707 }
708 // println("${ch:c} status: $status")
709
710 // exit on no compatible char with {} quantifier
711 if utf8util_char_len(ch) != 1 {
712 return regex.err_syntax_error, i, 0, false
713 }
714
715 // min parsing skip if comma present
716 if status == .start && ch == `,` {
717 q_min = 0 // default min in a {} quantifier is 0
718 status = .comma_checked
719 i++
720 continue
721 }
722
723 if status == .start && is_digit(ch) {
724 status = .min_parse
725 q_min *= 10
726 q_min += int(ch - `0`)
727 i++
728 continue
729 }
730
731 if status == .min_parse && is_digit(ch) {
732 q_min *= 10
733 q_min += int(ch - `0`)
734 i++
735 continue
736 }
737
738 // we have parsed the min, now check the max
739 if status == .min_parse && ch == `,` {
740 status = .comma_checked
741 i++
742 continue
743 }
744
745 // single value {4}
746 if status == .min_parse && ch == `}` {
747 q_max = q_min
748 status = .greedy
749 continue
750 }
751
752 // end without max
753 if status == .comma_checked && ch == `}` {
754 q_max = regex.max_quantifier
755 status = .greedy
756 continue
757 }
758
759 // start max parsing
760 if status == .comma_checked && is_digit(ch) {
761 status = .max_parse
762 q_max *= 10
763 q_max += int(ch - `0`)
764 i++
765 continue
766 }
767
768 // parse the max
769 if status == .max_parse && is_digit(ch) {
770 q_max *= 10
771 q_max += int(ch - `0`)
772 i++
773 continue
774 }
775
776 // finished the quantifier
777 if status == .max_parse && ch == `}` {
778 status = .greedy
779 continue
780 }
781
782 // check if greedy flag char ? is present
783 if status == .greedy {
784 if i + 1 < in_txt.len {
785 i++
786 status = .gredy_parse
787 continue
788 }
789 return q_min, q_max, i - in_i + 2, false
790 }
791
792 // check the greedy flag
793 if status == .gredy_parse {
794 if ch == `?` {
795 return q_min, q_max, i - in_i + 2, true
796 } else {
797 i--
798 return q_min, q_max, i - in_i + 2, false
799 }
800 }
801
802 // not a {} quantifier, exit
803 return regex.err_syntax_error, i, 0, false
804 }
805
806 // not a conform {} quantifier
807 return regex.err_syntax_error, i, 0, false
808}
809
810//
811// Groups
812//
813enum Group_parse_state {
814 start
815 q_mark // (?
816 q_mark1 // (?:|P checking
817 p_status // (?P
818 p_start // (?P<
819 p_end // (?P<...>
820 p_in_name // (?P<...
821 finish
822}
823
824// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index)
825fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
826 mut status := Group_parse_state.start
827 mut i := in_i
828 mut name := ''
829
830 for i < in_txt.len && status != .finish {
831 // get our char
832 char_tmp, char_len := re.get_char(in_txt, i)
833 ch := u8(char_tmp)
834
835 // start
836 if status == .start && ch == `(` {
837 status = .q_mark
838 i += char_len
839 continue
840 }
841
842 // check for question marks
843 if status == .q_mark && ch == `?` {
844 status = .q_mark1
845 i += char_len
846 continue
847 }
848
849 // negate group
850 if status == .q_mark1 && ch == `!` {
851 i += char_len
852 return 0, false, true, name, i
853 }
854
855 // non capturing group
856 if status == .q_mark1 && ch == `:` {
857 i += char_len
858 return 0, false, false, name, i
859 }
860
861 // enter in P section
862 if status == .q_mark1 && ch == `P` {
863 status = .p_status
864 i += char_len
865 continue
866 }
867
868 // not a valid q mark found
869 if status == .q_mark1 {
870 // println("NO VALID Q MARK")
871 return -2, true, false, name, i
872 }
873
874 if status == .p_status && ch == `<` {
875 status = .p_start
876 i += char_len
877 continue
878 }
879
880 if status == .p_start && ch != `>` {
881 status = .p_in_name
882 name += '${ch:1c}' // TODO: manage utf8 chars
883 i += char_len
884 continue
885 }
886
887 // colect name
888 if status == .p_in_name && ch != `>` && is_alnum(ch) {
889 name += '${ch:1c}' // TODO: manage utf8 chars
890 i += char_len
891 continue
892 }
893
894 // end name
895 if status == .p_in_name && ch == `>` {
896 i += char_len
897 return 0, true, false, name, i
898 }
899
900 // error on name group
901 if status == .p_in_name {
902 return -2, true, false, name, i
903 }
904
905 // normal group, nothig to do, exit
906 return 0, true, false, name, i
907 }
908 // UNREACHABLE
909 // println("ERROR!! NOT MEANT TO BE HERE!!1")
910 return -2, true, false, name, i
911}
912
913const (
914 quntifier_chars = [rune(`+`), `*`, `?`, `{`]
915)
916
917//
918// main compiler
919//
920// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
921fn (mut re RE) impl_compile(in_txt string) (int, int) {
922 mut i := 0 // input string index
923 mut pc := 0 // program counter
924
925 // group management variables
926 mut group_count := -1
927 mut group_stack := []int{len: re.group_max_nested, init: 0}
928 mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
929 mut group_stack_index := -1
930
931 re.query = in_txt // save the query string
932
933 i = 0
934 for i < in_txt.len {
935 mut char_tmp := u32(0)
936 mut char_len := 0
937 // println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
938
939 char_tmp, char_len = re.get_char(in_txt, i)
940
941 //
942 // check special cases: $ ^
943 //
944 if char_len == 1 && i == 0 && u8(char_tmp) == `^` {
945 re.flag = regex.f_ms
946 i = i + char_len
947 continue
948 }
949 if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` {
950 re.flag = regex.f_me
951 i = i + char_len
952 continue
953 }
954
955 // ist_group_start
956 if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` {
957 // check max groups allowed
958 if group_count > re.group_max {
959 return regex.err_groups_overflow, i + 1
960 }
961 group_stack_index++
962
963 // check max nested groups allowed
964 if group_stack_index > re.group_max_nested {
965 return regex.err_groups_max_nested, i + 1
966 }
967
968 tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
969 i)
970
971 // manage question mark format error
972 if tmp_res < -1 {
973 return regex.err_group_qm_notation, next_i
974 }
975
976 // println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
977 i = next_i
978
979 if cgroup_flag == true {
980 group_count++
981 }
982
983 // calculate the group id
984 // if it is a named group, recycle the group id
985 // NOTE: **** the group index is +1 because map return 0 when not found!! ****
986 mut group_id := group_count
987 if cgroup_name.len > 0 {
988 // println("GROUP NAME: ${cgroup_name}")
989 if cgroup_name in re.group_map {
990 group_id = re.group_map[cgroup_name] - 1
991 group_count--
992 } else {
993 re.group_map[cgroup_name] = group_id + 1
994 }
995 }
996
997 group_stack_txt_index[group_stack_index] = i
998 group_stack[group_stack_index] = pc
999
1000 re.prog[pc].ist = u32(0) | regex.ist_group_start
1001 re.prog[pc].rep_min = 1
1002 re.prog[pc].rep_max = 1
1003
1004 // manage negation groups
1005 if negate_flag == true {
1006 re.prog[pc].group_neg = true
1007 re.prog[pc].rep_min = 0 // may be not catched, but it is ok
1008 }
1009
1010 // set the group id
1011 if cgroup_flag == false {
1012 // println("NO CAPTURE GROUP")
1013 re.prog[pc].group_id = -1
1014 } else {
1015 re.prog[pc].group_id = group_id
1016 }
1017
1018 pc = pc + 1
1019 continue
1020 }
1021
1022 // ist_group_end
1023 if char_len == 1 && pc > 0 && u8(char_tmp) == `)` {
1024 if group_stack_index < 0 {
1025 return regex.err_group_not_balanced, i + 1
1026 }
1027
1028 goto_pc := group_stack[group_stack_index]
1029 group_stack_index--
1030
1031 re.prog[pc].ist = u32(0) | regex.ist_group_end
1032 re.prog[pc].rep_min = 1
1033 re.prog[pc].rep_max = 1
1034
1035 re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
1036 re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data
1037
1038 re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
1039 // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
1040
1041 // duplicate the negation group info and settings
1042 if re.prog[goto_pc].group_neg == true {
1043 re.prog[pc].group_neg = re.prog[goto_pc].group_neg
1044 re.prog[pc].rep_min = re.prog[goto_pc].rep_min
1045 }
1046
1047 pc = pc + 1
1048 i = i + char_len
1049 continue
1050 }
1051
1052 // ist_dot_char match any char except the following token
1053 if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` {
1054 // consecutive ist_dot_char is a syntax error
1055 if pc > 0 && re.prog[pc - 1].ist == regex.ist_dot_char {
1056 return regex.err_consecutive_dots, i
1057 }
1058
1059 re.prog[pc].ist = u32(0) | regex.ist_dot_char
1060 re.prog[pc].rep_min = 1
1061 re.prog[pc].rep_max = 1
1062 pc = pc + 1
1063 i = i + char_len
1064 continue
1065 }
1066
1067 // OR branch
1068 if char_len == 1 && pc > 0 && u8(char_tmp) == `|` {
1069 if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
1070 return regex.err_syntax_error, i
1071 }
1072 re.prog[pc].ist = u32(0) | regex.ist_or_branch
1073 re.prog[pc].source_index = i
1074 pc = pc + 1
1075 i = i + char_len
1076 continue
1077 }
1078
1079 // Quantifiers
1080 if char_len == 1 && pc > 0 {
1081 mut char_next := rune(0)
1082 mut char_next_len := 0
1083 if (char_len + i) < in_txt.len {
1084 char_next, char_next_len = re.get_char(in_txt, i + char_len)
1085 }
1086 mut quant_flag := true
1087
1088 // negation groups can not have quantifiers
1089 if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
1090 return regex.err_neg_group_quantifier, i
1091 }
1092
1093 match u8(char_tmp) {
1094 `?` {
1095 // println("q: ${char_tmp:c}")
1096 // check illegal quantifier sequences
1097 if char_next_len == 1 && char_next in regex.quntifier_chars {
1098 return regex.err_syntax_error, i
1099 }
1100 re.prog[pc - 1].rep_min = 0
1101 re.prog[pc - 1].rep_max = 1
1102 }
1103 `+` {
1104 // println("q: ${char_tmp:c}")
1105 // check illegal quantifier sequences
1106 if char_next_len == 1 && char_next in regex.quntifier_chars {
1107 return regex.err_syntax_error, i
1108 }
1109 re.prog[pc - 1].rep_min = 1
1110 re.prog[pc - 1].rep_max = regex.max_quantifier
1111 }
1112 `*` {
1113 // println("q: ${char_tmp:c}")
1114 // check illegal quantifier sequences
1115 if char_next_len == 1 && char_next in regex.quntifier_chars {
1116 return regex.err_syntax_error, i
1117 }
1118 re.prog[pc - 1].rep_min = 0
1119 re.prog[pc - 1].rep_max = regex.max_quantifier
1120 }
1121 `{` {
1122 min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1)
1123 // it is a quantifier
1124 if min >= 0 {
1125 // println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy")
1126 i = i + tmp
1127 re.prog[pc - 1].rep_min = min
1128 re.prog[pc - 1].rep_max = max
1129 re.prog[pc - 1].greedy = greedy
1130 // check illegal quantifier sequences
1131 if i <= in_txt.len {
1132 char_next, char_next_len = re.get_char(in_txt, i)
1133 if char_next_len == 1 && char_next in regex.quntifier_chars {
1134 return regex.err_syntax_error, i
1135 }
1136 }
1137 continue
1138 } else {
1139 return min, i
1140 }
1141
1142 // TODO: decide if the open bracket can be conform without the close bracket
1143 /*
1144 // no conform, parse as normal char
1145 else {
1146 quant_flag = false
1147 }
1148 */
1149 }
1150 else {
1151 quant_flag = false
1152 }
1153 }
1154
1155 if quant_flag {
1156 i = i + char_len
1157 continue
1158 }
1159 }
1160
1161 // IST_CHAR_CLASS_*
1162 if char_len == 1 && pc >= 0 {
1163 if u8(char_tmp) == `[` {
1164 cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1)
1165 if cc_index >= 0 {
1166 // println("index: $cc_index str:${in_txt[i..i+tmp]}")
1167 i = i + tmp
1168 re.prog[pc].ist = u32(0) | cc_type
1169 re.prog[pc].cc_index = cc_index
1170 re.prog[pc].rep_min = 1
1171 re.prog[pc].rep_max = 1
1172 pc = pc + 1
1173 continue
1174 }
1175 // cc_class vector memory full
1176 else if cc_index < 0 {
1177 return cc_index, i
1178 }
1179 }
1180 }
1181
1182 // ist_bsls_char
1183 if char_len == 1 && pc >= 0 {
1184 if u8(char_tmp) == `\\` {
1185 bsls_index, tmp := re.parse_bsls(in_txt, i)
1186 // println("index: $bsls_index str:${in_txt[i..i+tmp]}")
1187 if bsls_index >= 0 {
1188 i = i + tmp
1189 re.prog[pc].ist = u32(0) | regex.ist_bsls_char
1190 re.prog[pc].rep_min = 1
1191 re.prog[pc].rep_max = 1
1192 re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator
1193 re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch
1194 pc = pc + 1
1195 continue
1196 }
1197 // this is an escape char, skip the bsls and continue as a normal char
1198 else if bsls_index == regex.no_match_found {
1199 i += char_len
1200 char_tmp, char_len = re.get_char(in_txt, i)
1201 // continue as simple char
1202 }
1203 // if not an escape or a bsls char then it is an error (at least for now!)
1204 else {
1205 return bsls_index, i + tmp
1206 }
1207 }
1208 }
1209
1210 // ist_simple_char
1211 re.prog[pc].ist = regex.ist_simple_char
1212 re.prog[pc].ch = char_tmp
1213 re.prog[pc].ch_len = u8(char_len)
1214 re.prog[pc].rep_min = 1
1215 re.prog[pc].rep_max = 1
1216 // println("char: ${char_tmp:c}")
1217 pc = pc + 1
1218
1219 i += char_len
1220 }
1221
1222 // add end of the program
1223 re.prog[pc].ist = regex.ist_prog_end
1224 re.prog_len = pc
1225
1226 // check for unbalanced groups
1227 if group_stack_index != -1 {
1228 return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1
1229 }
1230
1231 // check for OR at the end of the program
1232 if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
1233 return regex.err_syntax_error, in_txt.len - 1
1234 }
1235
1236 // store the number of groups in the query
1237 re.group_count = group_count + 1
1238
1239 //******************************************
1240 // Post processing
1241 //******************************************
1242
1243 //
1244 // manage ist_dot_char
1245 //
1246
1247 // find the checks for dot chars, if any...
1248 mut pc1 := 0
1249 mut dot_char_count := 0
1250 mut last_dot_char_pc := -1
1251 for pc1 < pc {
1252 if re.prog[pc1].ist == regex.ist_dot_char {
1253 // println("Dot_char pc: $pc1")
1254 last_dot_char_pc = pc1
1255 dot_char_count++
1256 mut pc2 := pc1 + 1
1257 for pc2 < pc {
1258 // consecutive dot chars is an error
1259 if re.prog[pc2].ist == regex.ist_dot_char {
1260 return regex.err_syntax_error, 0
1261 }
1262 if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
1263 regex.ist_group_start] {
1264 // println("Next dot char check is PC: ${pc2}")
1265 re.prog[pc1].dot_check_pc = pc2
1266 break
1267 }
1268 pc2++
1269 }
1270 }
1271 pc1++
1272 }
1273
1274 // println("last_dot_char_pc: ${last_dot_char_pc}")
1275 if last_dot_char_pc >= 0 {
1276 pc1 = last_dot_char_pc + 1
1277 mut is_last_dot := true
1278 for pc1 < pc {
1279 if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
1280 is_last_dot = false
1281 break
1282 }
1283 pc1++
1284 }
1285 if is_last_dot {
1286 re.prog[last_dot_char_pc].last_dot_flag = true
1287 }
1288 }
1289
1290 //
1291 // manage bsls_char
1292 //
1293
1294 // find the checks for bsls, if any...
1295 pc1 = 0
1296 mut bsls_char_count := 0
1297 mut last_bsls_char_pc := -1
1298 for pc1 < pc {
1299 if re.prog[pc1].ist == regex.ist_bsls_char {
1300 // println("bsls_char pc: $pc1")
1301 last_bsls_char_pc = pc1
1302 bsls_char_count++
1303 mut pc2 := pc1 + 1
1304 for pc2 < pc {
1305 if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
1306 regex.ist_group_start] {
1307 // println("Next bsls check is PC: ${pc2}")
1308 re.prog[pc1].bsls_check_pc = pc2
1309 break
1310 }
1311 pc2++
1312 }
1313 }
1314 pc1++
1315 }
1316
1317 // println('last_bsls_char_pc: ${last_bsls_char_pc}')
1318 if last_bsls_char_pc >= 0 {
1319 pc1 = last_bsls_char_pc + 1
1320 mut is_last_bsls := true
1321 for pc1 < pc {
1322 if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
1323 is_last_bsls = false
1324 break
1325 }
1326 pc1++
1327 }
1328 if is_last_bsls {
1329 re.prog[last_bsls_char_pc].last_dot_flag = true
1330 }
1331 }
1332
1333 //
1334 // manage CC
1335 //
1336 pc1 = 0
1337 mut cc_char_count := 0
1338 mut last_cc_char_pc := -1
1339 for pc1 < pc {
1340 if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] {
1341 last_cc_char_pc = pc1
1342 cc_char_count++
1343 mut pc2 := pc1 + 1
1344 for pc2 < pc {
1345 if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
1346 regex.ist_group_start] {
1347 // println("Next CC check is PC: ${pc2}")
1348 re.prog[pc1].cc_check_pc = pc2
1349 break
1350 }
1351 pc2++
1352 }
1353 }
1354 pc1++
1355 }
1356
1357 // println('last_cc_char_pc: ${last_cc_char_pc}')
1358 if last_cc_char_pc >= 0 {
1359 pc1 = last_cc_char_pc + 1
1360 mut is_last_cc := true
1361 for pc1 < pc {
1362 if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
1363 is_last_cc = false
1364 break
1365 }
1366 pc1++
1367 }
1368 if is_last_cc {
1369 re.prog[last_cc_char_pc].last_dot_flag = true
1370 }
1371 }
1372
1373 //******************************************
1374
1375 // OR branch
1376 // a|b|cd
1377 // d exit point
1378 // a,b,c branches
1379 // set the jump in the right places
1380 pc1 = 0
1381 for pc1 < pc - 2 {
1382 // println("Here $pc1 ${pc-2}")
1383 // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
1384 if re.prog[pc1 + 1].ist == regex.ist_or_branch {
1385 // two consecutive OR are a syntax error
1386 if re.prog[pc1 + 2].ist == regex.ist_or_branch {
1387 return regex.err_syntax_error, i
1388 }
1389
1390 // check for []|[] errors
1391 if re.prog[pc1].ist == regex.ist_char_class_pos
1392 && re.prog[pc1 + 2].ist == regex.ist_char_class_pos {
1393 return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
1394 }
1395 }
1396
1397 // manange a|b chains like a|(b)|c|d...
1398 // standard solution
1399 if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch
1400 && re.prog[pc1 + 2].ist != regex.ist_or_branch {
1401 re.prog[pc1].next_is_or = true // set that the next token is an OR
1402 re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump
1403
1404 // match jump, if an OR chain the next token will be an OR token
1405 mut pc2 := pc1 + 2
1406 for pc2 < pc - 1 {
1407 ist := re.prog[pc2].ist
1408 if ist == regex.ist_group_start {
1409 re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1
1410 break
1411 }
1412 if ist != regex.ist_or_branch {
1413 re.prog[pc1 + 1].rep_max = pc2 + 1
1414 break
1415 }
1416
1417 pc2++
1418 }
1419 // special case query of few chars, the true can't go on the first instruction
1420 if re.prog[pc1 + 1].rep_max == pc1 {
1421 re.prog[pc1 + 1].rep_max = 3
1422 }
1423 // println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
1424 pc1 = pc2
1425 continue
1426 }
1427
1428 pc1++
1429 }
1430
1431 //******************************************
1432 // DEBUG PRINT REGEX GENERATED CODE
1433 //******************************************
1434 if re.debug > 0 {
1435 gc := re.get_code()
1436 re.log_func(gc)
1437 }
1438 //******************************************
1439
1440 return regex.compile_ok, 0
1441}
1442
1443// get_code return the compiled code as regex string, note: may be different from the source!
1444pub fn (re RE) get_code() string {
1445 mut pc1 := 0
1446 mut res := strings.new_builder(re.cc.len * 2 * re.prog.len)
1447 res.write_string('========================================\nv RegEx compiler v ${regex.v_regex_version} output:\n')
1448
1449 mut stop_flag := false
1450
1451 for pc1 <= re.prog.len {
1452 tk := re.prog[pc1]
1453 res.write_string('PC:${pc1:3d}')
1454
1455 res.write_string(' ist: ')
1456 res.write_string('${tk.ist:8x}'.replace(' ', '0'))
1457 res.write_string(' ')
1458 ist := tk.ist
1459 if ist == regex.ist_bsls_char {
1460 res.write_string('[\\${tk.ch:1c}] BSLS')
1461 if tk.last_dot_flag == true {
1462 res.write_string(' last!')
1463 }
1464 } else if ist == regex.ist_prog_end {
1465 res.write_string('PROG_END')
1466 stop_flag = true
1467 } else if ist == regex.ist_or_branch {
1468 res.write_string('OR ')
1469 } else if ist == regex.ist_char_class_pos {
1470 res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS')
1471 if tk.last_dot_flag == true {
1472 res.write_string(' last!')
1473 }
1474 } else if ist == regex.ist_char_class_neg {
1475 res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG')
1476 if tk.last_dot_flag == true {
1477 res.write_string(' last!')
1478 }
1479 } else if ist == regex.ist_dot_char {
1480 res.write_string('. DOT_CHAR nx chk: ${tk.dot_check_pc}')
1481 if tk.last_dot_flag == true {
1482 res.write_string(' last!')
1483 }
1484 } else if ist == regex.ist_group_start {
1485 res.write_string('( GROUP_START #:${tk.group_id}')
1486 if tk.group_id == -1 {
1487 res.write_string(' ?:')
1488 } else {
1489 for x in re.group_map.keys() {
1490 if re.group_map[x] == (tk.group_id + 1) {
1491 res.write_string(' ?P<${x}>')
1492 break
1493 }
1494 }
1495 }
1496 } else if ist == regex.ist_group_end {
1497 res.write_string(') GROUP_END #:${tk.group_id}')
1498 } else if ist == regex.ist_simple_char {
1499 res.write_string('[${tk.ch:1c}] query_ch')
1500 }
1501
1502 if tk.rep_max == regex.max_quantifier {
1503 res.write_string(' {${tk.rep_min:3d},MAX}')
1504 } else {
1505 if ist == regex.ist_or_branch {
1506 res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}')
1507 } else {
1508 res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}')
1509 }
1510 if tk.greedy == true {
1511 res.write_string('?')
1512 }
1513 }
1514
1515 res.write_string('\n')
1516 if stop_flag {
1517 break
1518 }
1519 pc1++
1520 }
1521
1522 res.write_string('========================================\n')
1523 return res.str()
1524}
1525
1526// get_query return a string with a reconstruction of the query starting from the regex program code
1527pub fn (re RE) get_query() string {
1528 mut res := strings.new_builder(re.query.len * 2)
1529
1530 if (re.flag & regex.f_ms) != 0 {
1531 res.write_string('^')
1532 }
1533
1534 mut i := 0
1535 for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 {
1536 tk := unsafe { &re.prog[i] }
1537 ch := tk.ist
1538
1539 // GROUP start
1540 if ch == regex.ist_group_start {
1541 if re.debug > 0 {
1542 res.write_string('#${tk.group_id}')
1543 }
1544 res.write_string('(')
1545
1546 if tk.group_neg == true {
1547 res.write_string('?!') // negation group
1548 } else if tk.group_id == -1 {
1549 res.write_string('?:') // non capturing group
1550 }
1551
1552 for x in re.group_map.keys() {
1553 if re.group_map[x] == (tk.group_id + 1) {
1554 res.write_string('?P<${x}>')
1555 break
1556 }
1557 }
1558
1559 i++
1560 continue
1561 }
1562
1563 // GROUP end
1564 if ch == regex.ist_group_end {
1565 res.write_string(')')
1566 }
1567
1568 // OR branch
1569 if ch == regex.ist_or_branch {
1570 res.write_string('|')
1571 if re.debug > 0 {
1572 res.write_string('{${tk.rep_min},${tk.rep_max}}')
1573 }
1574 i++
1575 continue
1576 }
1577
1578 // char class
1579 if ch == regex.ist_char_class_neg || ch == regex.ist_char_class_pos {
1580 res.write_string('[')
1581 if ch == regex.ist_char_class_neg {
1582 res.write_string('^')
1583 }
1584 res.write_string('${re.get_char_class(i)}')
1585 res.write_string(']')
1586 }
1587
1588 // bsls char
1589 if ch == regex.ist_bsls_char {
1590 res.write_string('\\${tk.ch:1c}')
1591 }
1592
1593 // ist_dot_char
1594 if ch == regex.ist_dot_char {
1595 res.write_string('.')
1596 }
1597
1598 // char alone
1599 if ch == regex.ist_simple_char {
1600 if u8(ch) in regex.bsls_escape_list {
1601 res.write_string('\\')
1602 }
1603 res.write_string('${tk.ch:c}')
1604 }
1605
1606 // quantifier
1607 if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
1608 if tk.rep_min == 0 && tk.rep_max == 1 {
1609 res.write_string('?')
1610 } else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
1611 res.write_string('+')
1612 } else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier {
1613 res.write_string('*')
1614 } else {
1615 if tk.rep_max == regex.max_quantifier {
1616 res.write_string('{${tk.rep_min},MAX}')
1617 } else {
1618 res.write_string('{${tk.rep_min},${tk.rep_max}}')
1619 }
1620 if tk.greedy == true {
1621 res.write_string('?')
1622 }
1623 }
1624 }
1625 i++
1626 }
1627 if (re.flag & regex.f_me) != 0 {
1628 res.write_string('$')
1629 }
1630
1631 return res.str()
1632}
1633
1634/******************************************************************************
1635*
1636* Groups saving utilities
1637*
1638******************************************************************************/
1639[direct_array_access]
1640fn (mut re RE) group_continuous_save(g_index int) {
1641 if re.group_csave_flag == true {
1642 // continuous save, save until we have space
1643
1644 // init the first element as counter
1645 if re.group_csave.len == 0 {
1646 re.group_csave << 0
1647 }
1648
1649 gi := g_index >> 1
1650 start := re.groups[g_index]
1651 end := re.groups[g_index + 1]
1652
1653 // check if we are simply increasing the size ot the found group
1654 if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3]
1655 && start == re.group_csave[re.group_csave.len - 2] {
1656 re.group_csave[re.group_csave.len - 1] = end
1657 return
1658 }
1659
1660 // otherwise append a new group to the list
1661
1662 // increment counter
1663 re.group_csave[0]++
1664 // save the record
1665 re.group_csave << (g_index >> 1) // group id
1666 re.group_csave << re.groups[g_index] // start
1667 re.group_csave << re.groups[g_index + 1] // end
1668 }
1669}
1670
1671/******************************************************************************
1672*
1673* Matching
1674*
1675******************************************************************************/
1676enum Match_state {
1677 start = 0
1678 stop
1679 end
1680 new_line
1681 ist_load // load and execute instruction
1682 ist_next // go to next instruction
1683 ist_next_ks // go to next instruction without clenaning the state
1684 ist_quant_p // match positive ,quantifier check
1685 ist_quant_n // match negative, quantifier check
1686 ist_quant_pg // match positive ,group quantifier check
1687 ist_quant_ng // match negative ,group quantifier check
1688}
1689
1690fn state_str(s Match_state) string {
1691 match s {
1692 .start { return 'start' }
1693 .stop { return 'stop' }
1694 .end { return 'end' }
1695 .new_line { return 'new line' }
1696 .ist_load { return 'ist_load' }
1697 .ist_next { return 'ist_next' }
1698 .ist_next_ks { return 'ist_next_ks' }
1699 .ist_quant_p { return 'ist_quant_p' }
1700 .ist_quant_n { return 'ist_quant_n' }
1701 .ist_quant_pg { return 'ist_quant_pg' }
1702 .ist_quant_ng { return 'ist_quant_ng' }
1703 }
1704}
1705
1706struct StateObj {
1707pub mut:
1708 group_index int = -1 // group id used to know how many groups are open
1709 match_flag bool // indicate if we are in a match condition
1710 match_index int = -1 // index of the last match
1711 first_match int = -1 // index of the first match
1712 pc int = -1 // program counter
1713 i int = -1 // source string index
1714 char_len int // last char legth
1715 last_dot_pc int = -1 // last dot chat pc
1716}
1717
1718[direct_array_access]
1719pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
1720 // result status
1721 mut result := regex.no_match_found // function return
1722
1723 mut ch := rune(0) // examinated char
1724 mut char_len := 0 // utf8 examinated char len
1725 mut m_state := Match_state.start // start point for the matcher FSM
1726 mut src_end := false
1727 mut last_fnd_pc := -1
1728
1729 mut state := StateObj{} // actual state
1730 mut ist := rune(0) // actual instruction
1731 mut l_ist := rune(0) // last matched instruction
1732
1733 mut step_count := 0 // stats for debug
1734 mut dbg_line := 0 // count debug line printed
1735
1736 re.reset()
1737
1738 if re.debug > 0 {
1739 // print header
1740 mut h_buf := strings.new_builder(32)
1741 h_buf.write_string('flags: ')
1742 h_buf.write_string('${re.flag:8x}'.replace(' ', '0'))
1743 h_buf.write_string('\n')
1744 sss := h_buf.str()
1745 re.log_func(sss)
1746 }
1747
1748 for m_state != .end {
1749 if state.pc >= 0 && state.pc < re.prog.len {
1750 ist = re.prog[state.pc].ist
1751 } else if state.pc >= re.prog.len {
1752 // println("ERROR!! PC overflow!!")
1753 return regex.err_internal_error, state.i
1754 }
1755
1756 //******************************************
1757 // DEBUG LOG
1758 //******************************************
1759 if re.debug > 0 {
1760 mut buf2 := strings.new_builder(re.cc.len + 128)
1761
1762 // print all the instructions
1763
1764 // end of the input text
1765 if state.i >= in_txt_len {
1766 buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n')
1767 sss := buf2.str()
1768 re.log_func(sss)
1769 } else {
1770 // print only the exe instruction
1771 if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 {
1772 if ist == regex.ist_prog_end {
1773 buf2.write_string('# ${step_count:3d} PROG_END\n')
1774 } else if ist == 0 || m_state in [.start, .ist_next, .stop] {
1775 buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n')
1776 } else {
1777 ch, char_len = re.get_charb(in_txt, state.i)
1778
1779 buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>')
1780 buf2.write_string('${ist:8x}'.replace(' ', '0'))
1781 buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
1782
1783 if ist == regex.ist_simple_char {
1784 buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
1785 } else {
1786 if ist == regex.ist_bsls_char {
1787 buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
1788 } else if ist == regex.ist_prog_end {
1789 buf2.write_string('PROG_END')
1790 } else if ist == regex.ist_or_branch {
1791 buf2.write_string('OR')
1792 } else if ist == regex.ist_char_class_pos {
1793 buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]')
1794 } else if ist == regex.ist_char_class_neg {
1795 buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]')
1796 } else if ist == regex.ist_dot_char {
1797 buf2.write_string('DOT_CHAR')
1798 } else if ist == regex.ist_group_start {
1799 tmp_gi := re.prog[state.pc].group_id
1800 tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
1801 buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ')
1802 } else if ist == regex.ist_group_end {
1803 buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:${state.group_index}')
1804 }
1805 }
1806 if re.prog[state.pc].rep_max == regex.max_quantifier {
1807 buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}')
1808 } else {
1809 buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}')
1810 }
1811 if re.prog[state.pc].greedy == true {
1812 buf2.write_string('?')
1813 }
1814 buf2.write_string(' (#${state.group_index})')
1815
1816 if ist == regex.ist_dot_char {
1817 buf2.write_string(' last!')
1818 }
1819
1820 buf2.write_string('\n')
1821 }
1822 sss2 := buf2.str()
1823 re.log_func(sss2)
1824 }
1825 }
1826 step_count++
1827 dbg_line++
1828 }
1829 //******************************************
1830
1831 if ist == regex.ist_prog_end {
1832 // println("HERE we end!")
1833 break
1834 }
1835
1836 // we're out of text, manage it
1837 if state.i >= in_txt_len || m_state == .new_line {
1838 // println("Finished text!!")
1839 src_end = true
1840
1841 // we have fished the text, we must manage out pf bound indexes
1842 if state.i >= in_txt_len {
1843 state.i = in_txt_len - 1
1844 }
1845
1846 // manage groups
1847 if state.group_index >= 0 && state.match_index >= 0 {
1848 // println("End text with open groups!")
1849 // println("state.group_index: ${state.group_index}")
1850 // close the groups
1851 for state.group_index >= 0 {
1852 tmp_pc := re.group_data[state.group_index]
1853 re.prog[tmp_pc].group_rep++
1854 // println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
1855
1856 if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min
1857 && re.prog[tmp_pc].group_id >= 0 {
1858 start_i := re.group_stack[state.group_index]
1859 re.group_stack[state.group_index] = -1
1860
1861 // save group results
1862 g_index := re.prog[tmp_pc].group_id * 2
1863 // println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
1864 if start_i >= 0 {
1865 re.groups[g_index] = start_i
1866 } else {
1867 re.groups[g_index] = 0
1868 }
1869
1870 re.groups[g_index + 1] = state.i
1871
1872 if re.groups[g_index + 1] >= in_txt_len {
1873 // println("clamp group on stop!")
1874 re.groups[g_index + 1] = in_txt_len - 1
1875 }
1876
1877 // continuous save, save until we have space
1878 re.group_continuous_save(g_index)
1879 }
1880 state.group_index--
1881 }
1882 }
1883
1884 // println("re.groups: ${re.groups}")
1885
1886 // the text is finished and the groups closed and we are the last group, ok exit
1887 if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
1888 // println("Last group end")
1889 return state.first_match, state.i
1890 }
1891
1892 if state.pc == -1 {
1893 state.pc = last_fnd_pc
1894 }
1895
1896 // println("Finished text!!")
1897 // println("Instruction: ${ist:08x} pc: $state.pc")
1898 // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}")
1899
1900 // program end
1901 if ist == regex.ist_prog_end {
1902 // println("Program end on end of text!")
1903 return state.first_match, state.i
1904 }
1905
1906 if l_ist in [
1907 rune(regex.ist_char_class_neg),
1908 regex.ist_char_class_pos,
1909 regex.ist_bsls_char,
1910 regex.ist_dot_char,
1911 ] {
1912 // println("***** We have a last special token")
1913 // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
1914 // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
1915 // println("first match: ${state.first_match}")
1916
1917 if re.prog[state.pc].last_dot_flag == true
1918 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min
1919 && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
1920 return state.first_match, state.i
1921 }
1922 // println("Not fitted!!")
1923 }
1924 // no groups open, check the last token quantifier
1925 if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
1926 if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
1927 && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
1928 // println("We are in good repetition")
1929 return state.first_match, state.i
1930 }
1931 }
1932
1933 // println("No good exit!!")
1934 if re.prog[re.prog_len - 1].ist == regex.ist_group_end {
1935 // println("last ist is a group end!")
1936 if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min {
1937 return state.first_match, state.i
1938 }
1939 }
1940 return regex.no_match_found, state.i
1941 }
1942
1943 // starting and init
1944 if m_state == .start {
1945 state.pc = -1
1946 state.i = 0
1947 m_state = .ist_next
1948 continue
1949 }
1950 // ist_next, next instruction reseting its state
1951 else if m_state == .ist_next {
1952 state.pc = state.pc + 1
1953 re.prog[state.pc].reset()
1954 // check if we are in the program bounds
1955 if state.pc < 0 || state.pc > re.prog.len {
1956 // println("ERROR!! PC overflow!!")
1957 return regex.err_internal_error, state.i
1958 }
1959 m_state = .ist_load
1960 continue
1961 }
1962 // ist_next_ks, next instruction keeping its state
1963 else if m_state == .ist_next_ks {
1964 state.pc = state.pc + 1
1965 // check if we are in the program bounds
1966 if state.pc < 0 || state.pc > re.prog.len {
1967 // println("ERROR!! PC overflow!!")
1968 return regex.err_internal_error, state.i
1969 }
1970 m_state = .ist_load
1971 continue
1972 }
1973
1974 // load the char
1975 ch, char_len = re.get_charb(in_txt, state.i)
1976
1977 // check new line if flag f_nl enabled
1978 if (re.flag & regex.f_nl) != 0 && char_len == 1 && u8(ch) in regex.new_line_list {
1979 m_state = .new_line
1980 continue
1981 }
1982 // check if stop
1983 else if m_state == .stop {
1984 // we are in search mode, don't exit until the end
1985 if (re.flag & regex.f_src) != 0 && ist != regex.ist_prog_end {
1986 last_fnd_pc = state.pc
1987 state.pc = -1
1988 state.i += char_len
1989
1990 m_state = .ist_next
1991 re.reset_src()
1992 state.match_index = -1
1993 state.first_match = -1
1994
1995 // reset state list
1996 re.reset()
1997
1998 continue
1999 }
2000
2001 if ist == regex.ist_prog_end {
2002 return state.first_match, state.i
2003 }
2004
2005 // manage here dot char
2006
2007 if re.state_list.len > 0 {
2008 // println("Here we are, with stop: state buffer: [${re.state_list.len}]")
2009 state = re.state_list.pop()
2010
2011 state.match_flag = true
2012 l_ist = u32(regex.ist_dot_char)
2013
2014 if state.first_match < 0 {
2015 state.first_match = state.i
2016 }
2017 state.match_index = state.i
2018 re.prog[state.pc].rep++ // increase repetitions
2019
2020 state.i += char_len
2021 m_state = .ist_quant_p
2022 continue
2023 }
2024
2025 // exit on no match
2026 return result, state.i
2027 }
2028 // ist_load
2029 else if m_state == .ist_load {
2030 // program end
2031 if ist == regex.ist_prog_end {
2032 // if we are in match exit well
2033
2034 if state.group_index >= 0 && state.match_index >= 0 {
2035 state.group_index = -1
2036 }
2037
2038 m_state = .stop
2039 continue
2040 }
2041 // check GROUP start, no quantifier is checkd for this token!!
2042 else if ist == regex.ist_group_start {
2043 state.group_index++
2044 re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
2045 re.group_stack[state.group_index] = state.i // index where we start to manage
2046 // println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
2047
2048 m_state = .ist_next
2049 continue
2050 }
2051 // check GROUP end
2052 else if ist == regex.ist_group_end {
2053 // we are in matching streak
2054 // println("Group END!! last ist: ${l_ist:08x}")
2055 if state.match_index >= 0 {
2056 // restore txt index stack and save the group data
2057
2058 // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
2059 if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
2060 start_i := re.group_stack[state.group_index]
2061
2062 // save group results
2063 g_index := re.prog[state.pc].group_id * 2
2064
2065 if start_i >= 0 {
2066 re.groups[g_index] = start_i
2067 } else {
2068 re.groups[g_index] = 0
2069 }
2070
2071 re.groups[g_index + 1] = state.i
2072
2073 if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] {
2074 re.groups[g_index] = re.groups[g_index - 1]
2075 }
2076
2077 if re.groups[g_index + 1] >= in_txt_len {
2078 // println("clamp group!")
2079 re.groups[g_index + 1] = in_txt_len - 1
2080 }
2081
2082 // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len")
2083
2084 // continuous save, save until we have space
2085 re.group_continuous_save(g_index)
2086 }
2087
2088 re.prog[state.pc].group_rep++ // increase repetitions
2089 // println("GROUP $group_index END ${re.prog[state.pc].group_rep}")
2090 if re.prog[state.pc].group_rep > in_txt_len - 1 {
2091 m_state = .ist_quant_ng
2092 continue
2093 }
2094
2095 m_state = .ist_quant_pg
2096 continue
2097 }
2098
2099 m_state = .ist_quant_ng
2100 continue
2101 }
2102 // check OR
2103 else if ist == regex.ist_or_branch {
2104 if state.match_index >= 0 {
2105 state.pc = re.prog[state.pc].rep_max
2106 // println("ist_or_branch True pc: $state.pc")
2107 } else {
2108 state.pc = re.prog[state.pc].rep_min
2109 // println("ist_or_branch False pc: $state.pc")
2110 }
2111 re.prog[state.pc].reset()
2112 m_state = .ist_load
2113 continue
2114 }
2115 // check ist_dot_char
2116 else if ist == regex.ist_dot_char {
2117 // println("ist_dot_char rep: ${re.prog[state.pc].rep}")
2118
2119 // check next token to be false
2120 mut next_check_flag := false
2121
2122 // if we are done with max go on dot char are dedicated case!!
2123 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2124 re.state_list.pop()
2125 m_state = .ist_next
2126 continue
2127 }
2128
2129 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0
2130 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2131 // load the char
2132 // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2133 ch_t := ch
2134 chk_pc := re.prog[state.pc].dot_check_pc
2135
2136 // simple char
2137 if re.prog[chk_pc].ist == regex.ist_simple_char {
2138 if re.prog[chk_pc].ch == ch_t {
2139 next_check_flag = true
2140 }
2141 // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2142 }
2143 // char char_class
2144 else if re.prog[chk_pc].ist == regex.ist_char_class_pos
2145 || re.prog[chk_pc].ist == regex.ist_char_class_neg {
2146 mut cc_neg := false
2147 if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2148 cc_neg = true
2149 }
2150 mut cc_res := re.check_char_class(chk_pc, ch_t)
2151
2152 if cc_neg {
2153 cc_res = !cc_res
2154 }
2155 next_check_flag = cc_res
2156 // println("Check [ist_char_class] => $next_check_flag")
2157 }
2158 // check bsls
2159 else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2160 next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
2161 // println("Check [ist_bsls_char] => $next_check_flag")
2162 }
2163 }
2164
2165 // check if we must continue or pass to the next IST
2166 if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
2167 // println("save the state!!")
2168 mut dot_state := StateObj{
2169 group_index: state.group_index
2170 match_flag: state.match_flag
2171 match_index: state.match_index
2172 first_match: state.first_match
2173 pc: state.pc
2174 i: state.i + char_len
2175 char_len: char_len
2176 last_dot_pc: state.pc
2177 }
2178 // if we are mananging a .* stay on the same char on return
2179 if re.prog[state.pc].rep_min == 0 {
2180 dot_state.i -= char_len
2181 }
2182
2183 re.state_list << dot_state
2184
2185 m_state = .ist_quant_n
2186 // println("dot_char stack len: ${re.state_list.len}")
2187 continue
2188 }
2189
2190 state.match_flag = true
2191 l_ist = u32(regex.ist_dot_char)
2192
2193 if state.first_match < 0 {
2194 state.first_match = state.i
2195 }
2196 state.match_index = state.i
2197 re.prog[state.pc].rep++ // increase repetitions
2198
2199 state.i += char_len
2200 m_state = .ist_quant_p
2201 continue
2202 }
2203 // char class IST
2204 else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg {
2205 // check next token to be false
2206 mut next_check_flag := false
2207
2208 // if we are done with max go on dot char are dedicated case!!
2209 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2210 re.state_list.pop()
2211 m_state = .ist_next
2212 continue
2213 }
2214
2215 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].cc_check_pc >= 0
2216 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2217 // load the char
2218 // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2219 ch_t := ch
2220 chk_pc := re.prog[state.pc].cc_check_pc
2221
2222 // simple char
2223 if re.prog[chk_pc].ist == regex.ist_simple_char {
2224 if re.prog[chk_pc].ch == ch_t {
2225 next_check_flag = true
2226 }
2227 // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2228 }
2229 // char char_class
2230 else if re.prog[chk_pc].ist == regex.ist_char_class_pos
2231 || re.prog[chk_pc].ist == regex.ist_char_class_neg {
2232 mut cc_neg := false
2233 if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2234 cc_neg = true
2235 }
2236 mut cc_res := re.check_char_class(chk_pc, ch_t)
2237
2238 if cc_neg {
2239 cc_res = !cc_res
2240 }
2241 next_check_flag = cc_res
2242 // println("Check [ist_char_class] => $next_check_flag")
2243 }
2244 // check bsls
2245 else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2246 next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
2247 // println("Check [ist_bsls_char] => $next_check_flag")
2248 }
2249 }
2250
2251 // check if we must continue or pass to the next IST
2252 if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
2253 // println("save the state!!")
2254 mut dot_state := StateObj{
2255 group_index: state.group_index
2256 match_flag: state.match_flag
2257 match_index: state.match_index
2258 first_match: state.first_match
2259 pc: state.pc
2260 i: state.i + char_len
2261 char_len: char_len
2262 last_dot_pc: state.pc
2263 }
2264 // if we are managing a \[something]* stay on the same char on return
2265 if re.prog[state.pc].rep_min == 0 {
2266 dot_state.i -= char_len
2267 }
2268
2269 re.state_list << dot_state
2270
2271 m_state = .ist_quant_n
2272 // println("dot_char stack len: ${re.state_list.len}")
2273 continue
2274 }
2275
2276 state.match_flag = false
2277 mut cc_neg := false
2278
2279 if ist == regex.ist_char_class_neg {
2280 cc_neg = true
2281 }
2282 mut cc_res := re.check_char_class(state.pc, ch)
2283
2284 if cc_neg {
2285 cc_res = !cc_res
2286 }
2287
2288 if cc_res {
2289 state.match_flag = true
2290 l_ist = u32(regex.ist_char_class_pos)
2291
2292 if state.first_match < 0 {
2293 state.first_match = state.i
2294 }
2295
2296 state.match_index = state.i
2297
2298 re.prog[state.pc].rep++ // increase repetitions
2299 state.i += char_len // next char
2300 m_state = .ist_quant_p
2301 continue
2302 }
2303 m_state = .ist_quant_n
2304 continue
2305 }
2306 // check bsls
2307 else if ist == regex.ist_bsls_char {
2308 // println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
2309
2310 // check next token to be false
2311 mut next_check_flag := false
2312
2313 // if we are done with max go on dot char are dedicated case!!
2314 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2315 re.state_list.pop()
2316 m_state = .ist_next
2317 continue
2318 }
2319
2320 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0
2321 && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2322 // load the char
2323 // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2324 ch_t := ch
2325 chk_pc := re.prog[state.pc].bsls_check_pc
2326
2327 // simple char
2328 if re.prog[chk_pc].ist == regex.ist_simple_char {
2329 if re.prog[chk_pc].ch == ch_t {
2330 next_check_flag = true
2331 }
2332 // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2333 }
2334 // char char_class
2335 else if re.prog[chk_pc].ist == regex.ist_char_class_pos
2336 || re.prog[chk_pc].ist == regex.ist_char_class_neg {
2337 mut cc_neg := false
2338 if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2339 cc_neg = true
2340 }
2341 mut cc_res := re.check_char_class(chk_pc, ch_t)
2342
2343 if cc_neg {
2344 cc_res = !cc_res
2345 }
2346 next_check_flag = cc_res
2347 // println("Check [ist_char_class] => $next_check_flag")
2348 }
2349 // check bsls
2350 else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2351 next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
2352 // println("Check [ist_bsls_char] => $next_check_flag")
2353 }
2354 }
2355
2356 // check if we must continue or pass to the next IST
2357 if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
2358 // println("save the state!!")
2359 mut dot_state := StateObj{
2360 group_index: state.group_index
2361 match_flag: state.match_flag
2362 match_index: state.match_index
2363 first_match: state.first_match
2364 pc: state.pc
2365 i: state.i + char_len
2366 char_len: char_len
2367 last_dot_pc: state.pc
2368 }
2369 // if we are managing a \[something]* stay on the same char on return
2370 if re.prog[state.pc].rep_min == 0 {
2371 dot_state.i -= char_len
2372 }
2373
2374 re.state_list << dot_state
2375
2376 m_state = .ist_quant_n
2377 // println("dot_char stack len: ${re.state_list.len}")
2378 continue
2379 }
2380
2381 tmp_res := re.prog[state.pc].validator(u8(ch))
2382 if tmp_res == false {
2383 m_state = .ist_quant_n
2384 continue
2385 }
2386 // println("${ch} => ${tmp_res}")
2387
2388 state.match_flag = true
2389 l_ist = u32(regex.ist_dot_char)
2390
2391 if state.first_match < 0 {
2392 state.first_match = state.i
2393 }
2394 state.match_index = state.i
2395 re.prog[state.pc].rep++ // increase repetitions
2396
2397 state.i += char_len
2398 m_state = .ist_quant_p
2399 continue
2400 }
2401 // simple char IST
2402 else if ist == regex.ist_simple_char {
2403 // println("ist_simple_char")
2404 state.match_flag = false
2405
2406 if re.prog[state.pc].ch == ch {
2407 state.match_flag = true
2408 l_ist = regex.ist_simple_char
2409
2410 if state.first_match < 0 {
2411 state.first_match = state.i
2412 }
2413 // println("state.match_index: ${state.match_index}")
2414 state.match_index = state.i
2415
2416 re.prog[state.pc].rep++ // increase repetitions
2417 state.i += char_len // next char
2418 m_state = .ist_quant_p
2419 continue
2420 }
2421 m_state = .ist_quant_n
2422 continue
2423 }
2424 // UNREACHABLE
2425 // println("PANIC2!! state: $m_state")
2426 return regex.err_internal_error, state.i
2427 }
2428 /***********************************
2429 * Quantifier management
2430 ***********************************/
2431 // ist_quant_ng => quantifier negative test on group
2432 else if m_state == .ist_quant_ng {
2433 // we are finished here
2434 if state.group_index < 0 {
2435 // println("Early stop!")
2436 result = regex.no_match_found
2437 m_state = .stop
2438 continue
2439 }
2440
2441 tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
2442 rep := re.prog[tmp_pc].group_rep // use a temp variable
2443 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2444
2445 // println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep")
2446
2447 if rep >= re.prog[tmp_pc].rep_min {
2448 // println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
2449
2450 state.i = re.group_stack[state.group_index]
2451 state.pc = tmp_pc
2452 state.group_index--
2453 m_state = .ist_next
2454 continue
2455 } else if re.prog[tmp_pc].next_is_or {
2456 // println("ist_quant_ng OR Negative branch")
2457
2458 state.i = re.group_stack[state.group_index]
2459 state.pc = re.prog[tmp_pc + 1].rep_min - 1
2460 state.group_index--
2461 m_state = .ist_next
2462 continue
2463 } else if rep > 0 && rep < re.prog[tmp_pc].rep_min {
2464 // println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index")
2465
2466 // check if we are inside a group, if yes exit from the nested groups
2467 if state.group_index > 0 {
2468 state.group_index--
2469 state.pc = tmp_pc
2470 m_state = .ist_quant_ng //.ist_next
2471 continue
2472 }
2473
2474 if state.group_index == 0 {
2475 state.group_index--
2476 state.pc = tmp_pc // TEST
2477 m_state = .ist_next
2478 continue
2479 }
2480
2481 result = regex.no_match_found
2482 m_state = .stop
2483 continue
2484 } else if rep == 0 && rep < re.prog[tmp_pc].rep_min {
2485 // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index")
2486
2487 if state.group_index > 0 {
2488 state.group_index--
2489 state.pc = tmp_pc
2490 m_state = .ist_quant_ng //.ist_next
2491 continue
2492 }
2493
2494 result = regex.no_match_found
2495 m_state = .stop
2496 continue
2497 }
2498
2499 // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep")
2500 // UNREACHABLE
2501 return regex.err_internal_error, state.i
2502 }
2503 // ist_quant_pg => quantifier positive test on group
2504 else if m_state == .ist_quant_pg {
2505 // println(".ist_quant_pg")
2506 mut tmp_pc := state.pc
2507 if state.group_index >= 0 {
2508 tmp_pc = re.group_data[state.group_index]
2509 }
2510
2511 if re.prog[tmp_pc].group_neg == true {
2512 // println("***** Negation of the group")
2513 result = regex.no_match_found
2514 m_state = .stop
2515 continue
2516 }
2517
2518 rep := re.prog[tmp_pc].group_rep
2519
2520 if rep < re.prog[tmp_pc].rep_min {
2521 // println("ist_quant_pg UNDER RANGE")
2522 state.pc = re.prog[tmp_pc].goto_pc
2523 m_state = .ist_next
2524 continue
2525 } else if rep == re.prog[tmp_pc].rep_max {
2526 // println("ist_quant_pg MAX RANGE")
2527 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2528 state.group_index--
2529 m_state = .ist_next
2530
2531 continue
2532 } else if rep >= re.prog[tmp_pc].rep_min {
2533 // println("ist_quant_pg IN RANGE group_index:$state.group_index")
2534
2535 // check greedy flag, if true exit on minimum
2536 if re.prog[tmp_pc].greedy == true {
2537 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2538 state.group_index--
2539 m_state = .ist_next
2540 continue
2541 }
2542
2543 state.pc = re.prog[tmp_pc].goto_pc - 1
2544 state.group_index--
2545 m_state = .ist_next
2546 continue
2547 }
2548
2549 // UNREACHABLE
2550 // println("PANIC3!! state: $m_state")
2551 return regex.err_internal_error, state.i
2552 }
2553 // ist_quant_n => quantifier negative test on token
2554 else if m_state == .ist_quant_n {
2555 rep := re.prog[state.pc].rep
2556 // println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}")
2557
2558 // zero quantifier * or ?
2559 if rep == 0 && re.prog[state.pc].rep_min == 0 {
2560 // println("ist_quant_n c_zero RANGE MIN")
2561 m_state = .ist_next // go to next ist
2562 continue
2563 }
2564 // match + or *
2565 else if rep >= re.prog[state.pc].rep_min {
2566 // println("ist_quant_n MATCH RANGE")
2567 m_state = .ist_next
2568 continue
2569 }
2570
2571 // check the OR if present
2572 if re.prog[state.pc].next_is_or {
2573 // println("OR present on failing")
2574 state.match_index = -1
2575 m_state = .ist_next
2576 continue
2577 }
2578
2579 // we are in a group manage no match from here
2580 if state.group_index >= 0 {
2581 // println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index")
2582 m_state = .ist_quant_ng
2583 continue
2584 }
2585
2586 // no other options
2587 // println("ist_quant_n no_match_found")
2588 result = regex.no_match_found
2589 m_state = .stop
2590
2591 // stop already started matching outside a capturing group
2592 if re.state_list.len > 0 && re.state_list.last().group_index == -1
2593 && re.state_list.last().last_dot_pc > 0 {
2594 if ist == regex.ist_dot_char || ist == regex.ist_bsls_char {
2595 return regex.no_match_found, 0
2596 }
2597 }
2598 continue
2599 }
2600 // ist_quant_p => quantifier positive test on token
2601 else if m_state == .ist_quant_p {
2602 // println("Here .ist_quant_p")
2603 // exit on first match
2604 if (re.flag & regex.f_efm) != 0 {
2605 return state.i, state.i + 1
2606 }
2607
2608 rep := re.prog[state.pc].rep
2609 // println(rep)
2610
2611 // under range
2612 if rep > 0 && rep < re.prog[state.pc].rep_min {
2613 // println("ist_quant_p UNDER RANGE")
2614 m_state = .ist_load // continue the loop
2615 continue
2616 }
2617 // range ok, continue loop
2618 else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max {
2619 // println("ist_quant_p IN RANGE")
2620
2621 // check greedy flag, if true exit on minimum
2622 if re.prog[state.pc].greedy == true {
2623 m_state = .ist_next
2624 continue
2625 }
2626 m_state = .ist_load
2627 continue
2628 }
2629 // max reached
2630 else if rep == re.prog[state.pc].rep_max {
2631 // println("ist_quant_p MAX RANGE")
2632 m_state = .ist_next
2633 continue
2634 }
2635 }
2636 // UNREACHABLE
2637 // println("PANIC4!! state: $m_state")
2638 return regex.err_internal_error, state.i
2639 }
2640
2641 // println("Check end of text!")
2642 // Check the results
2643 if state.match_index >= 0 {
2644 if state.group_index < 0 {
2645 if re.prog[state.pc].ist == regex.ist_prog_end {
2646 // println("program ended!!")
2647
2648 if (re.flag & regex.f_src) != 0 {
2649 // println("find return")
2650 return state.first_match, state.i
2651 } else {
2652 // println("Here!!")
2653 return 0, state.i
2654 }
2655 }
2656
2657 // println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
2658
2659 if re.prog[state.pc + 1].ist == regex.ist_prog_end
2660 || re.prog[state.pc].ist == regex.ist_prog_end {
2661 rep := re.prog[state.pc].rep
2662 // println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}")
2663 if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max {
2664 return state.first_match, state.i
2665 }
2666 // println("Program not finished! ")
2667 return regex.no_match_found, state.i
2668 }
2669 if src_end {
2670 // println("program end")
2671 return state.first_match, state.i
2672 }
2673 // print("No match found!!")
2674 return regex.no_match_found, state.i
2675 } else {
2676 // println("Group match! OK")
2677 // println("first_match: $state.first_match, i: $state.i")
2678
2679 // println("Skip last group")
2680 return state.first_match, state.i
2681 // return state.first_match,re.group_stack[state.group_index--]
2682 }
2683 }
2684 // println("no_match_found, natural end")
2685 return regex.no_match_found, state.i
2686}