Gitly

v / vlib / regex
Raw file | 2686 loc (2356 sloc) | 67.57 KB | Latest commit hash da68b2d36

1 /*
2 regex 1.0 alpha
3 
4 Copyright (c) 2019-2023 Dario Deledda. All rights reserved.
5 Use of this source code is governed by an MIT license
6 that can be found in the LICENSE file.
7 
8 This file contains regex module
9 
10 Know limitation:
11 - find is implemented in a trivial way
12 - not full compliant PCRE
13 - not compliant POSIX ERE
14 */
15 module regex
16 
17 import strings
18 
19 pub const (
20     v_regex_version          = '1.0 alpha' // regex module version
21 
22     max_code_len             = 256 // default small base code len for the regex programs
23     max_quantifier           = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
24     // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
25     spaces                   = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
26     // new line chars for now only '\n'
27     new_line_list            = [`\n`, `\r`]
28 
29     // Results
30     no_match_found           = -1
31 
32     // Errors
33     compile_ok               = 0 // the regex string compiled, all ok
34     err_char_unknown         = -2 // the char used is unknow to the system
35     err_undefined            = -3 // the compiler symbol is undefined
36     err_internal_error       = -4 // Bug in the regex system!!
37     err_cc_alloc_overflow    = -5 // memory for char class full!!
38     err_syntax_error         = -6 // syntax error in regex compiling
39     err_groups_overflow      = -7 // max number of groups reached
40     err_groups_max_nested    = -8 // max number of nested group reached
41     err_group_not_balanced   = -9 // group not balanced
42     err_group_qm_notation    = -10 // group invalid notation
43     err_invalid_or_with_cc   = -11 // invalid or on two consecutive char class
44     err_neg_group_quantifier = -12 // negation groups can not have quantifier
45     err_consecutive_dots     = -13 // two consecutive dots is an error
46 )
47 
48 const (
49     //*************************************
50     // regex program instructions
51     //*************************************
52     ist_simple_char    = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char
53     // char class 11 0100 AA xxxxxxxx
54     // AA = 00  regular class
55     // AA = 01  Negated class ^ char
56     ist_char_class     = u32(0xD1000000) // MASK
57     ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
58     ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]
59     // dot char        10 0110 xx xxxxxxxx
60     ist_dot_char       = u32(0x98000000) // match any char except \n
61     // backslash chars 10 0100 xx xxxxxxxx
62     ist_bsls_char      = u32(0x90000000) // backslash char
63     // OR |            10 010Y xx xxxxxxxx
64     ist_or_branch      = u32(0x91000000) // OR case
65     // groups          10 010Y xx xxxxxxxx
66     ist_group_start    = u32(0x92000000) // group start (
67     ist_group_end      = u32(0x94000000) // group end   )
68     // control instructions
69     ist_prog_end       = u32(0x88000000) // 10 0010 xx xxxxxxxx
70         //*************************************
71 )
72 
73 /*
74 General Utilities
75 */
76 // utf8util_char_len calculate the length in bytes of a utf8 char
77 [inline]
78 fn utf8util_char_len(b u8) int {
79     return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1
80 }
81 
82 // get_char get a char from position i and return an u32 with the unicode code
83 [direct_array_access; inline]
84 fn (re RE) get_char(in_txt string, i int) (u32, int) {
85     ini := unsafe { in_txt.str[i] }
86     // ascii 8 bit
87     if (re.flag & regex.f_bin) != 0 || ini & 0x80 == 0 {
88         return u32(ini), 1
89     }
90     // unicode char
91     char_len := utf8util_char_len(ini)
92     mut tmp := 0
93     mut ch := u32(0)
94     for tmp < char_len {
95         ch = (ch << 8) | unsafe { in_txt.str[i + tmp] }
96         tmp++
97     }
98     return ch, char_len
99 }
100 
101 // get_charb get a char from position i and return an u32 with the unicode code
102 [direct_array_access; inline]
103 fn (re RE) get_charb(in_txt &u8, i int) (u32, int) {
104     // ascii 8 bit
105     if (re.flag & regex.f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 {
106         return u32(unsafe { in_txt[i] }), 1
107     }
108     // unicode char
109     char_len := utf8util_char_len(unsafe { in_txt[i] })
110     mut tmp := 0
111     mut ch := u32(0)
112     for tmp < char_len {
113         ch = (ch << 8) | unsafe { in_txt[i + tmp] }
114         tmp++
115     }
116     return ch, char_len
117 }
118 
119 [inline]
120 fn is_alnum(in_char u8) bool {
121     mut tmp := in_char - `A`
122     if tmp <= 25 {
123         return true
124     }
125     tmp = in_char - `a`
126     if tmp <= 25 {
127         return true
128     }
129     tmp = in_char - `0`
130     if tmp <= 9 {
131         return true
132     }
133     if in_char == `_` {
134         return true
135     }
136     return false
137 }
138 
139 [inline]
140 fn is_not_alnum(in_char u8) bool {
141     return !is_alnum(in_char)
142 }
143 
144 [inline]
145 fn is_space(in_char u8) bool {
146     return in_char in regex.spaces
147 }
148 
149 [inline]
150 fn is_not_space(in_char u8) bool {
151     return !is_space(in_char)
152 }
153 
154 [inline]
155 fn is_digit(in_char u8) bool {
156     tmp := in_char - `0`
157     return tmp <= 0x09
158 }
159 
160 [inline]
161 fn is_not_digit(in_char u8) bool {
162     return !is_digit(in_char)
163 }
164 
165 /*
166 [inline]
167 fn is_wordchar(in_char byte) bool {
168     return is_alnum(in_char) || in_char == `_`
169 }
170 
171 [inline]
172 fn is_not_wordchar(in_char byte) bool {
173     return !is_alnum(in_char)
174 }
175 */
176 
177 [inline]
178 fn is_lower(in_char u8) bool {
179     tmp := in_char - `a`
180     return tmp <= 25
181 }
182 
183 [inline]
184 fn is_upper(in_char u8) bool {
185     tmp := in_char - `A`
186     return tmp <= 25
187 }
188 
189 pub fn (re RE) get_parse_error_string(err int) string {
190     match err {
191         regex.compile_ok { return 'compile_ok' }
192         regex.no_match_found { return 'no_match_found' }
193         regex.err_char_unknown { return 'err_char_unknown' }
194         regex.err_undefined { return 'err_undefined' }
195         regex.err_internal_error { return 'err_internal_error' }
196         regex.err_cc_alloc_overflow { return 'err_cc_alloc_overflow' }
197         regex.err_syntax_error { return 'err_syntax_error' }
198         regex.err_groups_overflow { return 'err_groups_overflow' }
199         regex.err_groups_max_nested { return 'err_groups_max_nested' }
200         regex.err_group_not_balanced { return 'err_group_not_balanced' }
201         regex.err_group_qm_notation { return 'err_group_qm_notation' }
202         regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
203         regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
204         regex.err_consecutive_dots { return 'err_consecutive_dots' }
205         else { return 'err_unknown' }
206     }
207 }
208 
209 // utf8_str convert and utf8 sequence to a printable string
210 [inline]
211 fn utf8_str(ch rune) string {
212     mut i := 4
213     mut res := ''
214     for i > 0 {
215         v := u8((ch >> ((i - 1) * 8)) & 0xFF)
216         if v != 0 {
217             res += '${v:1c}'
218         }
219         i--
220     }
221     return res
222 }
223 
224 // simple_log default log function
225 fn simple_log(txt string) {
226     print(txt)
227 }
228 
229 /******************************************************************************
230 *
231 * Token Structs
232 *
233 ******************************************************************************/
234 pub type FnValidator = fn (u8) bool
235 
236 struct Token {
237 mut:
238     ist rune
239     // char
240     ch     rune // char of the token if any
241     ch_len u8   // char len
242     // Quantifiers / branch
243     rep_min int  // used also for jump next in the OR branch [no match] pc jump
244     rep_max int  // used also for jump next in the OR branch [   match] pc jump
245     greedy  bool // greedy quantifier flag
246     // Char class
247     cc_index int = -1
248     // counters for quantifier check (repetitions)
249     rep int
250     // validator function pointer
251     validator FnValidator = unsafe { nil }
252     // groups variables
253     group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion
254     group_rep int  // repetition of the group
255     group_id  int = -1 // id of the group
256     goto_pc   int = -1 // jump to this PC if is needed
257     // OR flag for the token
258     next_is_or bool // true if the next token is an OR
259     // dot_char token variables
260     dot_check_pc  int = -1 // pc of the next token to check for dots
261     bsls_check_pc int = -1 // pc of the next token to check for bsls
262     cc_check_pc   int = -1 // pc of the next token to check for CC
263     last_dot_flag bool // if true indicate that is the last dot_char in the regex
264     // debug fields
265     source_index int
266 }
267 
268 [inline]
269 fn (mut tok Token) reset() {
270     tok.rep = 0
271 }
272 
273 /******************************************************************************
274 *
275 * Regex struct
276 *
277 ******************************************************************************/
278 pub const (
279     f_nl  = 0x00000001 // end the match when find a new line symbol
280     f_ms  = 0x00000002 // match true only if the match is at the start of the string
281     f_me  = 0x00000004 // match true only if the match is at the end of the string
282 
283     f_efm = 0x00000100 // exit on first token matched, used by search
284     f_bin = 0x00000200 // work only on bytes, ignore utf-8
285     // behaviour modifier flags
286     f_src = 0x00020000 // search mode enabled
287 )
288 
289 // Log function prototype
290 pub type FnLog = fn (string)
291 
292 pub struct RE {
293 pub mut:
294     prog     []Token
295     prog_len int // regex program len
296     // char classes storage
297     cc       []CharClass // char class list
298     cc_index int // index
299     // groups
300     group_count      int   // number of groups in this regex struct
301     groups           []int // groups index results
302     group_max_nested int = 3 // max nested group
303     group_max        int = 8 // max allowed number of different groups
304 
305     state_list []StateObj
306 
307     group_csave_flag bool  // flag to enable continuous saving
308     group_csave      []int //= []int{}  // groups continuous save list
309 
310     group_map map[string]int // groups names map
311 
312     group_stack []int
313     group_data  []int
314     // flags
315     flag int // flag for optional parameters
316     // Debug/log
317     debug    int    // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
318     log_func FnLog = simple_log // log function, can be customized by the user
319     query    string // query string
320 }
321 
322 // Reset RE object
323 [direct_array_access; inline]
324 pub fn (mut re RE) reset() {
325     re.cc_index = 0
326 
327     mut i := 0
328     for i < re.prog_len {
329         re.prog[i].group_rep = 0 // clear repetition of the group
330         re.prog[i].rep = 0 // clear repetition of the token
331         i++
332     }
333 
334     // init groups array
335     if re.group_count > 0 {
336         if re.groups.len == 0 {
337             // first run alloc memory
338             re.groups = []int{len: re.group_count * 2, init: -1}
339         } else {
340             // subsequent executions, only clean up the memory
341             i = 0
342             for i < re.groups.len {
343                 re.groups[i] = -1
344                 i++
345             }
346         }
347     }
348 
349     // reset group_csave
350     if re.group_csave_flag == true {
351         re.group_csave.clear() // = []int{}
352     }
353 
354     // reset state list
355     re.state_list.clear()
356     re.group_stack.clear()
357 }
358 
359 // reset for search mode fail
360 // gcc bug, dont use [inline] or go 5 time slower
361 //[inline]
362 [direct_array_access]
363 fn (mut re RE) reset_src() {
364     mut i := 0
365     for i < re.prog_len {
366         re.prog[i].group_rep = 0 // clear repetition of the group
367         re.prog[i].rep = 0 // clear repetition of the token
368         i++
369     }
370 }
371 
372 /******************************************************************************
373 *
374 * Backslashes chars
375 *
376 ******************************************************************************/
377 struct BslsStruct {
378     ch        rune        // meta char
379     validator FnValidator = unsafe { nil } // validator function pointer
380 }
381 
382 const (
383     bsls_validator_array = [
384         BslsStruct{`w`, is_alnum},
385         BslsStruct{`W`, is_not_alnum},
386         BslsStruct{`s`, is_space},
387         BslsStruct{`S`, is_not_space},
388         BslsStruct{`d`, is_digit},
389         BslsStruct{`D`, is_not_digit},
390         BslsStruct{`a`, is_lower},
391         BslsStruct{`A`, is_upper},
392     ]
393 
394     // these chars are escape if preceded by a \
395     bsls_escape_list     = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`,
396         `^`, `!`]
397 )
398 
399 enum BSLS_parse_state {
400     start
401     bsls_found
402     bsls_char
403     normal_char
404 }
405 
406 // parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present
407 fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
408     mut status := BSLS_parse_state.start
409     mut i := in_i
410 
411     for i < in_txt.len {
412         // get our char
413         char_tmp, char_len := re.get_char(in_txt, i)
414         ch := u8(char_tmp)
415 
416         if status == .start && ch == `\\` {
417             status = .bsls_found
418             i += char_len
419             continue
420         }
421 
422         // check if is our bsls char, for now only one length sequence
423         if status == .bsls_found {
424             for c, x in regex.bsls_validator_array {
425                 if x.ch == ch {
426                     return c, i - in_i + 1
427                 }
428             }
429             status = .normal_char
430             continue
431         }
432 
433         // no BSLS validator, manage as normal escape char char
434         if status == .normal_char {
435             if ch in regex.bsls_escape_list {
436                 return regex.no_match_found, i - in_i + 1
437             }
438             return regex.err_syntax_error, i - in_i + 1
439         }
440 
441         // at the present time we manage only one char after the \
442         break
443     }
444     // not our bsls return KO
445     return regex.err_syntax_error, i
446 }
447 
448 /******************************************************************************
449 *
450 * Char class
451 *
452 ******************************************************************************/
453 const (
454     cc_null = 0 // empty cc token
455     cc_char = 1 // simple char: a
456     cc_int  = 2 // char interval: a-z
457     cc_bsls = 3 // backslash char
458     cc_end  = 4 // cc sequence terminator
459 )
460 
461 struct CharClass {
462 mut:
463     cc_type   int         // type of cc token
464     ch0       rune        // first char of the interval a-b  a in this case
465     ch1       rune        // second char of the interval a-b b in this case
466     validator FnValidator = unsafe { nil } // validator function pointer
467 }
468 
469 enum CharClass_parse_state {
470     start
471     in_char
472     in_bsls
473     separator
474     finish
475 }
476 
477 fn (re RE) get_char_class(pc int) string {
478     buf := []u8{len: (re.cc.len)}
479     mut buf_ptr := unsafe { &u8(&buf) }
480 
481     mut cc_i := re.prog[pc].cc_index
482     mut i := 0
483     mut tmp := 0
484     for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
485         if re.cc[cc_i].cc_type == regex.cc_bsls {
486             unsafe {
487                 buf_ptr[i] = `\\`
488                 i++
489                 buf_ptr[i] = u8(re.cc[cc_i].ch0)
490                 i++
491             }
492         } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
493             tmp = 3
494             for tmp >= 0 {
495                 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
496                 if x != 0 {
497                     unsafe {
498                         buf_ptr[i] = x
499                         i++
500                     }
501                 }
502                 tmp--
503             }
504         } else {
505             tmp = 3
506             for tmp >= 0 {
507                 x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF)
508                 if x != 0 {
509                     unsafe {
510                         buf_ptr[i] = x
511                         i++
512                     }
513                 }
514                 tmp--
515             }
516             unsafe {
517                 buf_ptr[i] = `-`
518                 i++
519             }
520             tmp = 3
521             for tmp >= 0 {
522                 x := u8((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF)
523                 if x != 0 {
524                     unsafe {
525                         buf_ptr[i] = x
526                         i++
527                     }
528                 }
529                 tmp--
530             }
531         }
532         cc_i++
533     }
534     unsafe {
535         buf_ptr[i] = u8(0)
536     }
537     return unsafe { tos_clone(buf_ptr) }
538 }
539 
540 fn (re RE) check_char_class(pc int, ch rune) bool {
541     mut cc_i := re.prog[pc].cc_index
542     for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
543         if re.cc[cc_i].cc_type == regex.cc_bsls {
544             if re.cc[cc_i].validator(u8(ch)) {
545                 return true
546             }
547         } else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 {
548             return true
549         }
550         cc_i++
551     }
552     return false
553 }
554 
555 // parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
556 fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
557     mut status := CharClass_parse_state.start
558     mut i := in_i
559 
560     mut tmp_index := re.cc_index
561     res_index := re.cc_index
562 
563     mut cc_type := u32(regex.ist_char_class_pos)
564 
565     for i < in_txt.len {
566         // check if we are out of memory for char classes
567         if tmp_index >= re.cc.len {
568             return regex.err_cc_alloc_overflow, 0, u32(0)
569         }
570 
571         // get our char
572         char_tmp, char_len := re.get_char(in_txt, i)
573         ch := u8(char_tmp)
574 
575         // println("CC #${i:3d} ch: ${ch:c}")
576 
577         // negation
578         if status == .start && ch == `^` {
579             cc_type = u32(regex.ist_char_class_neg)
580             i += char_len
581             continue
582         }
583 
584         // minus symbol
585         if status == .start && ch == `-` {
586             re.cc[tmp_index].cc_type = regex.cc_char
587             re.cc[tmp_index].ch0 = char_tmp
588             re.cc[tmp_index].ch1 = char_tmp
589             i += char_len
590             tmp_index++
591             continue
592         }
593 
594         // bsls
595         if (status == .start || status == .in_char) && ch == `\\` {
596             // println("CC bsls.")
597             status = .in_bsls
598             i += char_len
599             continue
600         }
601 
602         if status == .in_bsls {
603             // println("CC bsls validation.")
604             for c, x in regex.bsls_validator_array {
605                 if x.ch == ch {
606                     // println("CC bsls found [${ch:c}]")
607                     re.cc[tmp_index].cc_type = regex.cc_bsls
608                     re.cc[tmp_index].ch0 = regex.bsls_validator_array[c].ch
609                     re.cc[tmp_index].ch1 = regex.bsls_validator_array[c].ch
610                     re.cc[tmp_index].validator = regex.bsls_validator_array[c].validator
611                     i += char_len
612                     tmp_index++
613                     status = .in_char
614                     break
615                 }
616             }
617             if status == .in_bsls {
618                 // manage as a simple char
619                 // println("CC bsls not found [${ch:c}]")
620                 re.cc[tmp_index].cc_type = regex.cc_char
621                 re.cc[tmp_index].ch0 = char_tmp
622                 re.cc[tmp_index].ch1 = char_tmp
623                 i += char_len
624                 tmp_index++
625                 status = .in_char
626                 continue
627             } else {
628                 continue
629             }
630         }
631 
632         // simple char
633         if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
634             status = .in_char
635 
636             re.cc[tmp_index].cc_type = regex.cc_char
637             re.cc[tmp_index].ch0 = char_tmp
638             re.cc[tmp_index].ch1 = char_tmp
639 
640             i += char_len
641             tmp_index++
642             continue
643         }
644 
645         // check range separator
646         if status == .in_char && ch == `-` {
647             status = .separator
648             i += char_len
649             continue
650         }
651 
652         // check range end
653         if status == .separator && ch != `]` && ch != `-` {
654             status = .in_char
655             re.cc[tmp_index - 1].cc_type = regex.cc_int
656             re.cc[tmp_index - 1].ch1 = char_tmp
657             i += char_len
658             continue
659         }
660 
661         // char class end
662         if status == .in_char && ch == `]` {
663             re.cc[tmp_index].cc_type = regex.cc_end
664             re.cc[tmp_index].ch0 = 0
665             re.cc[tmp_index].ch1 = 0
666             re.cc_index = tmp_index + 1
667 
668             return res_index, i - in_i + 2, cc_type
669         }
670 
671         i++
672     }
673     return regex.err_syntax_error, 0, u32(0)
674 }
675 
676 /******************************************************************************
677 *
678 * Re Compiler
679 *
680 ******************************************************************************/
681 //
682 // Quantifier
683 //
684 enum Quant_parse_state {
685     start
686     min_parse
687     comma_checked
688     max_parse
689     greedy
690     gredy_parse
691     finish
692 }
693 
694 // parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
695 fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
696     mut status := Quant_parse_state.start
697     mut i := in_i
698 
699     mut q_min := 0 // default min in a {} quantifier is 1
700     mut q_max := 0 // default max in a {} quantifier is max_quantifier
701 
702     mut ch := u8(0)
703 
704     for i < in_txt.len {
705         unsafe {
706             ch = in_txt.str[i]
707         }
708         // println("${ch:c} status: $status")
709 
710         // exit on no compatible char with {} quantifier
711         if utf8util_char_len(ch) != 1 {
712             return regex.err_syntax_error, i, 0, false
713         }
714 
715         // min parsing skip if comma present
716         if status == .start && ch == `,` {
717             q_min = 0 // default min in a {} quantifier is 0
718             status = .comma_checked
719             i++
720             continue
721         }
722 
723         if status == .start && is_digit(ch) {
724             status = .min_parse
725             q_min *= 10
726             q_min += int(ch - `0`)
727             i++
728             continue
729         }
730 
731         if status == .min_parse && is_digit(ch) {
732             q_min *= 10
733             q_min += int(ch - `0`)
734             i++
735             continue
736         }
737 
738         // we have parsed the min, now check the max
739         if status == .min_parse && ch == `,` {
740             status = .comma_checked
741             i++
742             continue
743         }
744 
745         // single value {4}
746         if status == .min_parse && ch == `}` {
747             q_max = q_min
748             status = .greedy
749             continue
750         }
751 
752         // end without max
753         if status == .comma_checked && ch == `}` {
754             q_max = regex.max_quantifier
755             status = .greedy
756             continue
757         }
758 
759         // start max parsing
760         if status == .comma_checked && is_digit(ch) {
761             status = .max_parse
762             q_max *= 10
763             q_max += int(ch - `0`)
764             i++
765             continue
766         }
767 
768         // parse the max
769         if status == .max_parse && is_digit(ch) {
770             q_max *= 10
771             q_max += int(ch - `0`)
772             i++
773             continue
774         }
775 
776         // finished the quantifier
777         if status == .max_parse && ch == `}` {
778             status = .greedy
779             continue
780         }
781 
782         // check if greedy flag char ? is present
783         if status == .greedy {
784             if i + 1 < in_txt.len {
785                 i++
786                 status = .gredy_parse
787                 continue
788             }
789             return q_min, q_max, i - in_i + 2, false
790         }
791 
792         // check the greedy flag
793         if status == .gredy_parse {
794             if ch == `?` {
795                 return q_min, q_max, i - in_i + 2, true
796             } else {
797                 i--
798                 return q_min, q_max, i - in_i + 2, false
799             }
800         }
801 
802         // not  a {} quantifier, exit
803         return regex.err_syntax_error, i, 0, false
804     }
805 
806     // not a conform {} quantifier
807     return regex.err_syntax_error, i, 0, false
808 }
809 
810 //
811 // Groups
812 //
813 enum Group_parse_state {
814     start
815     q_mark // (?
816     q_mark1 // (?:|P  checking
817     p_status // (?P
818     p_start // (?P<
819     p_end // (?P<...>
820     p_in_name // (?P<...
821     finish
822 }
823 
824 // parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index)
825 fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
826     mut status := Group_parse_state.start
827     mut i := in_i
828     mut name := ''
829 
830     for i < in_txt.len && status != .finish {
831         // get our char
832         char_tmp, char_len := re.get_char(in_txt, i)
833         ch := u8(char_tmp)
834 
835         // start
836         if status == .start && ch == `(` {
837             status = .q_mark
838             i += char_len
839             continue
840         }
841 
842         // check for question marks
843         if status == .q_mark && ch == `?` {
844             status = .q_mark1
845             i += char_len
846             continue
847         }
848 
849         // negate group
850         if status == .q_mark1 && ch == `!` {
851             i += char_len
852             return 0, false, true, name, i
853         }
854 
855         // non capturing group
856         if status == .q_mark1 && ch == `:` {
857             i += char_len
858             return 0, false, false, name, i
859         }
860 
861         // enter in P section
862         if status == .q_mark1 && ch == `P` {
863             status = .p_status
864             i += char_len
865             continue
866         }
867 
868         // not a valid q mark found
869         if status == .q_mark1 {
870             // println("NO VALID Q MARK")
871             return -2, true, false, name, i
872         }
873 
874         if status == .p_status && ch == `<` {
875             status = .p_start
876             i += char_len
877             continue
878         }
879 
880         if status == .p_start && ch != `>` {
881             status = .p_in_name
882             name += '${ch:1c}' // TODO: manage utf8 chars
883             i += char_len
884             continue
885         }
886 
887         // colect name
888         if status == .p_in_name && ch != `>` && is_alnum(ch) {
889             name += '${ch:1c}' // TODO: manage utf8 chars
890             i += char_len
891             continue
892         }
893 
894         // end name
895         if status == .p_in_name && ch == `>` {
896             i += char_len
897             return 0, true, false, name, i
898         }
899 
900         // error on name group
901         if status == .p_in_name {
902             return -2, true, false, name, i
903         }
904 
905         // normal group, nothig to do, exit
906         return 0, true, false, name, i
907     }
908     // UNREACHABLE
909     // println("ERROR!! NOT MEANT TO BE HERE!!1")
910     return -2, true, false, name, i
911 }
912 
913 const (
914     quntifier_chars = [rune(`+`), `*`, `?`, `{`]
915 )
916 
917 //
918 // main compiler
919 //
920 // compile return (return code, index) where index is the index of the error in the query string if return code is an error code
921 fn (mut re RE) impl_compile(in_txt string) (int, int) {
922     mut i := 0 // input string index
923     mut pc := 0 // program counter
924 
925     // group management variables
926     mut group_count := -1
927     mut group_stack := []int{len: re.group_max_nested, init: 0}
928     mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
929     mut group_stack_index := -1
930 
931     re.query = in_txt // save the query string
932 
933     i = 0
934     for i < in_txt.len {
935         mut char_tmp := u32(0)
936         mut char_len := 0
937         // println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
938 
939         char_tmp, char_len = re.get_char(in_txt, i)
940 
941         //
942         // check special cases: $ ^
943         //
944         if char_len == 1 && i == 0 && u8(char_tmp) == `^` {
945             re.flag = regex.f_ms
946             i = i + char_len
947             continue
948         }
949         if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` {
950             re.flag = regex.f_me
951             i = i + char_len
952             continue
953         }
954 
955         // ist_group_start
956         if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` {
957             // check max groups allowed
958             if group_count > re.group_max {
959                 return regex.err_groups_overflow, i + 1
960             }
961             group_stack_index++
962 
963             // check max nested groups allowed
964             if group_stack_index > re.group_max_nested {
965                 return regex.err_groups_max_nested, i + 1
966             }
967 
968             tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
969                 i)
970 
971             // manage question mark format error
972             if tmp_res < -1 {
973                 return regex.err_group_qm_notation, next_i
974             }
975 
976             // println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
977             i = next_i
978 
979             if cgroup_flag == true {
980                 group_count++
981             }
982 
983             // calculate the group id
984             // if it is a named group, recycle the group id
985             // NOTE: **** the group index is +1 because map return 0 when not found!! ****
986             mut group_id := group_count
987             if cgroup_name.len > 0 {
988                 // println("GROUP NAME: ${cgroup_name}")
989                 if cgroup_name in re.group_map {
990                     group_id = re.group_map[cgroup_name] - 1
991                     group_count--
992                 } else {
993                     re.group_map[cgroup_name] = group_id + 1
994                 }
995             }
996 
997             group_stack_txt_index[group_stack_index] = i
998             group_stack[group_stack_index] = pc
999 
1000             re.prog[pc].ist = u32(0) | regex.ist_group_start
1001             re.prog[pc].rep_min = 1
1002             re.prog[pc].rep_max = 1
1003 
1004             // manage negation groups
1005             if negate_flag == true {
1006                 re.prog[pc].group_neg = true
1007                 re.prog[pc].rep_min = 0 // may be not catched, but it is ok
1008             }
1009 
1010             // set the group id
1011             if cgroup_flag == false {
1012                 // println("NO CAPTURE GROUP")
1013                 re.prog[pc].group_id = -1
1014             } else {
1015                 re.prog[pc].group_id = group_id
1016             }
1017 
1018             pc = pc + 1
1019             continue
1020         }
1021 
1022         // ist_group_end
1023         if char_len == 1 && pc > 0 && u8(char_tmp) == `)` {
1024             if group_stack_index < 0 {
1025                 return regex.err_group_not_balanced, i + 1
1026             }
1027 
1028             goto_pc := group_stack[group_stack_index]
1029             group_stack_index--
1030 
1031             re.prog[pc].ist = u32(0) | regex.ist_group_end
1032             re.prog[pc].rep_min = 1
1033             re.prog[pc].rep_max = 1
1034 
1035             re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
1036             re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data
1037 
1038             re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
1039             // re.prog[goto_pc].group_id = group_count         // id of this group, used for storing data
1040 
1041             // duplicate the negation group info and settings
1042             if re.prog[goto_pc].group_neg == true {
1043                 re.prog[pc].group_neg = re.prog[goto_pc].group_neg
1044                 re.prog[pc].rep_min = re.prog[goto_pc].rep_min
1045             }
1046 
1047             pc = pc + 1
1048             i = i + char_len
1049             continue
1050         }
1051 
1052         // ist_dot_char match any char except the following token
1053         if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` {
1054             // consecutive ist_dot_char is a syntax error
1055             if pc > 0 && re.prog[pc - 1].ist == regex.ist_dot_char {
1056                 return regex.err_consecutive_dots, i
1057             }
1058 
1059             re.prog[pc].ist = u32(0) | regex.ist_dot_char
1060             re.prog[pc].rep_min = 1
1061             re.prog[pc].rep_max = 1
1062             pc = pc + 1
1063             i = i + char_len
1064             continue
1065         }
1066 
1067         // OR branch
1068         if char_len == 1 && pc > 0 && u8(char_tmp) == `|` {
1069             if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
1070                 return regex.err_syntax_error, i
1071             }
1072             re.prog[pc].ist = u32(0) | regex.ist_or_branch
1073             re.prog[pc].source_index = i
1074             pc = pc + 1
1075             i = i + char_len
1076             continue
1077         }
1078 
1079         // Quantifiers
1080         if char_len == 1 && pc > 0 {
1081             mut char_next := rune(0)
1082             mut char_next_len := 0
1083             if (char_len + i) < in_txt.len {
1084                 char_next, char_next_len = re.get_char(in_txt, i + char_len)
1085             }
1086             mut quant_flag := true
1087 
1088             // negation groups can not have quantifiers
1089             if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
1090                 return regex.err_neg_group_quantifier, i
1091             }
1092 
1093             match u8(char_tmp) {
1094                 `?` {
1095                     // println("q: ${char_tmp:c}")
1096                     // check illegal quantifier sequences
1097                     if char_next_len == 1 && char_next in regex.quntifier_chars {
1098                         return regex.err_syntax_error, i
1099                     }
1100                     re.prog[pc - 1].rep_min = 0
1101                     re.prog[pc - 1].rep_max = 1
1102                 }
1103                 `+` {
1104                     // println("q: ${char_tmp:c}")
1105                     // check illegal quantifier sequences
1106                     if char_next_len == 1 && char_next in regex.quntifier_chars {
1107                         return regex.err_syntax_error, i
1108                     }
1109                     re.prog[pc - 1].rep_min = 1
1110                     re.prog[pc - 1].rep_max = regex.max_quantifier
1111                 }
1112                 `*` {
1113                     // println("q: ${char_tmp:c}")
1114                     // check illegal quantifier sequences
1115                     if char_next_len == 1 && char_next in regex.quntifier_chars {
1116                         return regex.err_syntax_error, i
1117                     }
1118                     re.prog[pc - 1].rep_min = 0
1119                     re.prog[pc - 1].rep_max = regex.max_quantifier
1120                 }
1121                 `{` {
1122                     min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1)
1123                     // it is a quantifier
1124                     if min >= 0 {
1125                         // println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy")
1126                         i = i + tmp
1127                         re.prog[pc - 1].rep_min = min
1128                         re.prog[pc - 1].rep_max = max
1129                         re.prog[pc - 1].greedy = greedy
1130                         // check illegal quantifier sequences
1131                         if i <= in_txt.len {
1132                             char_next, char_next_len = re.get_char(in_txt, i)
1133                             if char_next_len == 1 && char_next in regex.quntifier_chars {
1134                                 return regex.err_syntax_error, i
1135                             }
1136                         }
1137                         continue
1138                     } else {
1139                         return min, i
1140                     }
1141 
1142                     // TODO: decide if the open bracket can be conform without the close bracket
1143                     /*
1144                     // no conform, parse as normal char
1145                     else {
1146                         quant_flag = false
1147                     }
1148                     */
1149                 }
1150                 else {
1151                     quant_flag = false
1152                 }
1153             }
1154 
1155             if quant_flag {
1156                 i = i + char_len
1157                 continue
1158             }
1159         }
1160 
1161         // IST_CHAR_CLASS_*
1162         if char_len == 1 && pc >= 0 {
1163             if u8(char_tmp) == `[` {
1164                 cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1)
1165                 if cc_index >= 0 {
1166                     // println("index: $cc_index str:${in_txt[i..i+tmp]}")
1167                     i = i + tmp
1168                     re.prog[pc].ist = u32(0) | cc_type
1169                     re.prog[pc].cc_index = cc_index
1170                     re.prog[pc].rep_min = 1
1171                     re.prog[pc].rep_max = 1
1172                     pc = pc + 1
1173                     continue
1174                 }
1175                 // cc_class vector memory full
1176                 else if cc_index < 0 {
1177                     return cc_index, i
1178                 }
1179             }
1180         }
1181 
1182         // ist_bsls_char
1183         if char_len == 1 && pc >= 0 {
1184             if u8(char_tmp) == `\\` {
1185                 bsls_index, tmp := re.parse_bsls(in_txt, i)
1186                 // println("index: $bsls_index str:${in_txt[i..i+tmp]}")
1187                 if bsls_index >= 0 {
1188                     i = i + tmp
1189                     re.prog[pc].ist = u32(0) | regex.ist_bsls_char
1190                     re.prog[pc].rep_min = 1
1191                     re.prog[pc].rep_max = 1
1192                     re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator
1193                     re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch
1194                     pc = pc + 1
1195                     continue
1196                 }
1197                 // this is an escape char, skip the bsls and continue as a normal char
1198                 else if bsls_index == regex.no_match_found {
1199                     i += char_len
1200                     char_tmp, char_len = re.get_char(in_txt, i)
1201                     // continue as simple char
1202                 }
1203                 // if not an escape or a bsls char then it is an error (at least for now!)
1204                 else {
1205                     return bsls_index, i + tmp
1206                 }
1207             }
1208         }
1209 
1210         // ist_simple_char
1211         re.prog[pc].ist = regex.ist_simple_char
1212         re.prog[pc].ch = char_tmp
1213         re.prog[pc].ch_len = u8(char_len)
1214         re.prog[pc].rep_min = 1
1215         re.prog[pc].rep_max = 1
1216         // println("char: ${char_tmp:c}")
1217         pc = pc + 1
1218 
1219         i += char_len
1220     }
1221 
1222     // add end of the program
1223     re.prog[pc].ist = regex.ist_prog_end
1224     re.prog_len = pc
1225 
1226     // check for unbalanced groups
1227     if group_stack_index != -1 {
1228         return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1
1229     }
1230 
1231     // check for OR at the end of the program
1232     if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
1233         return regex.err_syntax_error, in_txt.len - 1
1234     }
1235 
1236     // store the number of groups in the query
1237     re.group_count = group_count + 1
1238 
1239     //******************************************
1240     // Post processing
1241     //******************************************
1242 
1243     //
1244     // manage ist_dot_char
1245     //
1246 
1247     // find the checks for dot chars, if any...
1248     mut pc1 := 0
1249     mut dot_char_count := 0
1250     mut last_dot_char_pc := -1
1251     for pc1 < pc {
1252         if re.prog[pc1].ist == regex.ist_dot_char {
1253             // println("Dot_char pc: $pc1")
1254             last_dot_char_pc = pc1
1255             dot_char_count++
1256             mut pc2 := pc1 + 1
1257             for pc2 < pc {
1258                 // consecutive dot chars is an error
1259                 if re.prog[pc2].ist == regex.ist_dot_char {
1260                     return regex.err_syntax_error, 0
1261                 }
1262                 if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
1263                     regex.ist_group_start] {
1264                     // println("Next dot char check is PC: ${pc2}")
1265                     re.prog[pc1].dot_check_pc = pc2
1266                     break
1267                 }
1268                 pc2++
1269             }
1270         }
1271         pc1++
1272     }
1273 
1274     // println("last_dot_char_pc: ${last_dot_char_pc}")
1275     if last_dot_char_pc >= 0 {
1276         pc1 = last_dot_char_pc + 1
1277         mut is_last_dot := true
1278         for pc1 < pc {
1279             if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
1280                 is_last_dot = false
1281                 break
1282             }
1283             pc1++
1284         }
1285         if is_last_dot {
1286             re.prog[last_dot_char_pc].last_dot_flag = true
1287         }
1288     }
1289 
1290     //
1291     // manage bsls_char
1292     //
1293 
1294     // find the checks for bsls, if any...
1295     pc1 = 0
1296     mut bsls_char_count := 0
1297     mut last_bsls_char_pc := -1
1298     for pc1 < pc {
1299         if re.prog[pc1].ist == regex.ist_bsls_char {
1300             // println("bsls_char pc: $pc1")
1301             last_bsls_char_pc = pc1
1302             bsls_char_count++
1303             mut pc2 := pc1 + 1
1304             for pc2 < pc {
1305                 if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
1306                     regex.ist_group_start] {
1307                     // println("Next bsls check is PC: ${pc2}")
1308                     re.prog[pc1].bsls_check_pc = pc2
1309                     break
1310                 }
1311                 pc2++
1312             }
1313         }
1314         pc1++
1315     }
1316 
1317     // println('last_bsls_char_pc: ${last_bsls_char_pc}')
1318     if last_bsls_char_pc >= 0 {
1319         pc1 = last_bsls_char_pc + 1
1320         mut is_last_bsls := true
1321         for pc1 < pc {
1322             if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
1323                 is_last_bsls = false
1324                 break
1325             }
1326             pc1++
1327         }
1328         if is_last_bsls {
1329             re.prog[last_bsls_char_pc].last_dot_flag = true
1330         }
1331     }
1332 
1333     //
1334     // manage CC
1335     //
1336     pc1 = 0
1337     mut cc_char_count := 0
1338     mut last_cc_char_pc := -1
1339     for pc1 < pc {
1340         if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] {
1341             last_cc_char_pc = pc1
1342             cc_char_count++
1343             mut pc2 := pc1 + 1
1344             for pc2 < pc {
1345                 if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
1346                     regex.ist_group_start] {
1347                     // println("Next CC check is PC: ${pc2}")
1348                     re.prog[pc1].cc_check_pc = pc2
1349                     break
1350                 }
1351                 pc2++
1352             }
1353         }
1354         pc1++
1355     }
1356 
1357     // println('last_cc_char_pc: ${last_cc_char_pc}')
1358     if last_cc_char_pc >= 0 {
1359         pc1 = last_cc_char_pc + 1
1360         mut is_last_cc := true
1361         for pc1 < pc {
1362             if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
1363                 is_last_cc = false
1364                 break
1365             }
1366             pc1++
1367         }
1368         if is_last_cc {
1369             re.prog[last_cc_char_pc].last_dot_flag = true
1370         }
1371     }
1372 
1373     //******************************************
1374 
1375     // OR branch
1376     // a|b|cd
1377     // d exit point
1378     // a,b,c branches
1379     // set the jump in the right places
1380     pc1 = 0
1381     for pc1 < pc - 2 {
1382         // println("Here $pc1 ${pc-2}")
1383         // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
1384         if re.prog[pc1 + 1].ist == regex.ist_or_branch {
1385             // two consecutive OR are a syntax error
1386             if re.prog[pc1 + 2].ist == regex.ist_or_branch {
1387                 return regex.err_syntax_error, i
1388             }
1389 
1390             // check for []|[] errors
1391             if re.prog[pc1].ist == regex.ist_char_class_pos
1392                 && re.prog[pc1 + 2].ist == regex.ist_char_class_pos {
1393                 return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
1394             }
1395         }
1396 
1397         // manange a|b chains like a|(b)|c|d...
1398         // standard solution
1399         if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch
1400             && re.prog[pc1 + 2].ist != regex.ist_or_branch {
1401             re.prog[pc1].next_is_or = true // set that the next token is an  OR
1402             re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump
1403 
1404             // match jump, if an OR chain the next token will be an OR token
1405             mut pc2 := pc1 + 2
1406             for pc2 < pc - 1 {
1407                 ist := re.prog[pc2].ist
1408                 if ist == regex.ist_group_start {
1409                     re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1
1410                     break
1411                 }
1412                 if ist != regex.ist_or_branch {
1413                     re.prog[pc1 + 1].rep_max = pc2 + 1
1414                     break
1415                 }
1416 
1417                 pc2++
1418             }
1419             // special case query of few chars, the true can't go on the first instruction
1420             if re.prog[pc1 + 1].rep_max == pc1 {
1421                 re.prog[pc1 + 1].rep_max = 3
1422             }
1423             // println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
1424             pc1 = pc2
1425             continue
1426         }
1427 
1428         pc1++
1429     }
1430 
1431     //******************************************
1432     // DEBUG PRINT REGEX GENERATED CODE
1433     //******************************************
1434     if re.debug > 0 {
1435         gc := re.get_code()
1436         re.log_func(gc)
1437     }
1438     //******************************************
1439 
1440     return regex.compile_ok, 0
1441 }
1442 
1443 // get_code return the compiled code as regex string, note: may be different from the source!
1444 pub fn (re RE) get_code() string {
1445     mut pc1 := 0
1446     mut res := strings.new_builder(re.cc.len * 2 * re.prog.len)
1447     res.write_string('========================================\nv RegEx compiler v ${regex.v_regex_version} output:\n')
1448 
1449     mut stop_flag := false
1450 
1451     for pc1 <= re.prog.len {
1452         tk := re.prog[pc1]
1453         res.write_string('PC:${pc1:3d}')
1454 
1455         res.write_string(' ist: ')
1456         res.write_string('${tk.ist:8x}'.replace(' ', '0'))
1457         res.write_string(' ')
1458         ist := tk.ist
1459         if ist == regex.ist_bsls_char {
1460             res.write_string('[\\${tk.ch:1c}]     BSLS')
1461             if tk.last_dot_flag == true {
1462                 res.write_string(' last!')
1463             }
1464         } else if ist == regex.ist_prog_end {
1465             res.write_string('PROG_END')
1466             stop_flag = true
1467         } else if ist == regex.ist_or_branch {
1468             res.write_string('OR      ')
1469         } else if ist == regex.ist_char_class_pos {
1470             res.write_string('[${re.get_char_class(pc1)}]     CHAR_CLASS_POS')
1471             if tk.last_dot_flag == true {
1472                 res.write_string(' last!')
1473             }
1474         } else if ist == regex.ist_char_class_neg {
1475             res.write_string('[^${re.get_char_class(pc1)}]    CHAR_CLASS_NEG')
1476             if tk.last_dot_flag == true {
1477                 res.write_string(' last!')
1478             }
1479         } else if ist == regex.ist_dot_char {
1480             res.write_string('.        DOT_CHAR nx chk: ${tk.dot_check_pc}')
1481             if tk.last_dot_flag == true {
1482                 res.write_string(' last!')
1483             }
1484         } else if ist == regex.ist_group_start {
1485             res.write_string('(        GROUP_START #:${tk.group_id}')
1486             if tk.group_id == -1 {
1487                 res.write_string(' ?:')
1488             } else {
1489                 for x in re.group_map.keys() {
1490                     if re.group_map[x] == (tk.group_id + 1) {
1491                         res.write_string(' ?P<${x}>')
1492                         break
1493                     }
1494                 }
1495             }
1496         } else if ist == regex.ist_group_end {
1497             res.write_string(')        GROUP_END   #:${tk.group_id}')
1498         } else if ist == regex.ist_simple_char {
1499             res.write_string('[${tk.ch:1c}]      query_ch')
1500         }
1501 
1502         if tk.rep_max == regex.max_quantifier {
1503             res.write_string(' {${tk.rep_min:3d},MAX}')
1504         } else {
1505             if ist == regex.ist_or_branch {
1506                 res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}')
1507             } else {
1508                 res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}')
1509             }
1510             if tk.greedy == true {
1511                 res.write_string('?')
1512             }
1513         }
1514 
1515         res.write_string('\n')
1516         if stop_flag {
1517             break
1518         }
1519         pc1++
1520     }
1521 
1522     res.write_string('========================================\n')
1523     return res.str()
1524 }
1525 
1526 // get_query return a string with a reconstruction of the query starting from the regex program code
1527 pub fn (re RE) get_query() string {
1528     mut res := strings.new_builder(re.query.len * 2)
1529 
1530     if (re.flag & regex.f_ms) != 0 {
1531         res.write_string('^')
1532     }
1533 
1534     mut i := 0
1535     for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 {
1536         tk := unsafe { &re.prog[i] }
1537         ch := tk.ist
1538 
1539         // GROUP start
1540         if ch == regex.ist_group_start {
1541             if re.debug > 0 {
1542                 res.write_string('#${tk.group_id}')
1543             }
1544             res.write_string('(')
1545 
1546             if tk.group_neg == true {
1547                 res.write_string('?!') // negation group
1548             } else if tk.group_id == -1 {
1549                 res.write_string('?:') // non capturing group
1550             }
1551 
1552             for x in re.group_map.keys() {
1553                 if re.group_map[x] == (tk.group_id + 1) {
1554                     res.write_string('?P<${x}>')
1555                     break
1556                 }
1557             }
1558 
1559             i++
1560             continue
1561         }
1562 
1563         // GROUP end
1564         if ch == regex.ist_group_end {
1565             res.write_string(')')
1566         }
1567 
1568         // OR branch
1569         if ch == regex.ist_or_branch {
1570             res.write_string('|')
1571             if re.debug > 0 {
1572                 res.write_string('{${tk.rep_min},${tk.rep_max}}')
1573             }
1574             i++
1575             continue
1576         }
1577 
1578         // char class
1579         if ch == regex.ist_char_class_neg || ch == regex.ist_char_class_pos {
1580             res.write_string('[')
1581             if ch == regex.ist_char_class_neg {
1582                 res.write_string('^')
1583             }
1584             res.write_string('${re.get_char_class(i)}')
1585             res.write_string(']')
1586         }
1587 
1588         // bsls char
1589         if ch == regex.ist_bsls_char {
1590             res.write_string('\\${tk.ch:1c}')
1591         }
1592 
1593         // ist_dot_char
1594         if ch == regex.ist_dot_char {
1595             res.write_string('.')
1596         }
1597 
1598         // char alone
1599         if ch == regex.ist_simple_char {
1600             if u8(ch) in regex.bsls_escape_list {
1601                 res.write_string('\\')
1602             }
1603             res.write_string('${tk.ch:c}')
1604         }
1605 
1606         // quantifier
1607         if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
1608             if tk.rep_min == 0 && tk.rep_max == 1 {
1609                 res.write_string('?')
1610             } else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
1611                 res.write_string('+')
1612             } else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier {
1613                 res.write_string('*')
1614             } else {
1615                 if tk.rep_max == regex.max_quantifier {
1616                     res.write_string('{${tk.rep_min},MAX}')
1617                 } else {
1618                     res.write_string('{${tk.rep_min},${tk.rep_max}}')
1619                 }
1620                 if tk.greedy == true {
1621                     res.write_string('?')
1622                 }
1623             }
1624         }
1625         i++
1626     }
1627     if (re.flag & regex.f_me) != 0 {
1628         res.write_string('$')
1629     }
1630 
1631     return res.str()
1632 }
1633 
1634 /******************************************************************************
1635 *
1636 * Groups saving utilities
1637 *
1638 ******************************************************************************/
1639 [direct_array_access]
1640 fn (mut re RE) group_continuous_save(g_index int) {
1641     if re.group_csave_flag == true {
1642         // continuous save, save until we have space
1643 
1644         // init the first element as counter
1645         if re.group_csave.len == 0 {
1646             re.group_csave << 0
1647         }
1648 
1649         gi := g_index >> 1
1650         start := re.groups[g_index]
1651         end := re.groups[g_index + 1]
1652 
1653         // check if we are simply increasing the size ot the found group
1654         if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3]
1655             && start == re.group_csave[re.group_csave.len - 2] {
1656             re.group_csave[re.group_csave.len - 1] = end
1657             return
1658         }
1659 
1660         // otherwise append a new group to the list
1661 
1662         // increment counter
1663         re.group_csave[0]++
1664         // save the record
1665         re.group_csave << (g_index >> 1) // group id
1666         re.group_csave << re.groups[g_index] // start
1667         re.group_csave << re.groups[g_index + 1] // end
1668     }
1669 }
1670 
1671 /******************************************************************************
1672 *
1673 * Matching
1674 *
1675 ******************************************************************************/
1676 enum Match_state {
1677     start        = 0
1678     stop
1679     end
1680     new_line
1681     ist_load // load and execute instruction
1682     ist_next // go to next instruction
1683     ist_next_ks // go to next instruction without clenaning the state
1684     ist_quant_p // match positive ,quantifier check
1685     ist_quant_n // match negative, quantifier check
1686     ist_quant_pg // match positive ,group quantifier check
1687     ist_quant_ng // match negative ,group quantifier check
1688 }
1689 
1690 fn state_str(s Match_state) string {
1691     match s {
1692         .start { return 'start' }
1693         .stop { return 'stop' }
1694         .end { return 'end' }
1695         .new_line { return 'new line' }
1696         .ist_load { return 'ist_load' }
1697         .ist_next { return 'ist_next' }
1698         .ist_next_ks { return 'ist_next_ks' }
1699         .ist_quant_p { return 'ist_quant_p' }
1700         .ist_quant_n { return 'ist_quant_n' }
1701         .ist_quant_pg { return 'ist_quant_pg' }
1702         .ist_quant_ng { return 'ist_quant_ng' }
1703     }
1704 }
1705 
1706 struct StateObj {
1707 pub mut:
1708     group_index int = -1 // group id used to know how many groups are open
1709     match_flag  bool // indicate if we are in a match condition
1710     match_index int = -1 // index of the last match
1711     first_match int = -1 // index of the first match
1712     pc          int = -1 // program counter
1713     i           int = -1 // source string index
1714     char_len    int  // last char legth
1715     last_dot_pc int = -1 // last dot chat pc
1716 }
1717 
1718 [direct_array_access]
1719 pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
1720     // result status
1721     mut result := regex.no_match_found // function return
1722 
1723     mut ch := rune(0) // examinated char
1724     mut char_len := 0 // utf8 examinated char len
1725     mut m_state := Match_state.start // start point for the matcher FSM
1726     mut src_end := false
1727     mut last_fnd_pc := -1
1728 
1729     mut state := StateObj{} // actual state
1730     mut ist := rune(0) // actual instruction
1731     mut l_ist := rune(0) // last matched instruction
1732 
1733     mut step_count := 0 // stats for debug
1734     mut dbg_line := 0 // count debug line printed
1735 
1736     re.reset()
1737 
1738     if re.debug > 0 {
1739         // print header
1740         mut h_buf := strings.new_builder(32)
1741         h_buf.write_string('flags: ')
1742         h_buf.write_string('${re.flag:8x}'.replace(' ', '0'))
1743         h_buf.write_string('\n')
1744         sss := h_buf.str()
1745         re.log_func(sss)
1746     }
1747 
1748     for m_state != .end {
1749         if state.pc >= 0 && state.pc < re.prog.len {
1750             ist = re.prog[state.pc].ist
1751         } else if state.pc >= re.prog.len {
1752             // println("ERROR!! PC overflow!!")
1753             return regex.err_internal_error, state.i
1754         }
1755 
1756         //******************************************
1757         // DEBUG LOG
1758         //******************************************
1759         if re.debug > 0 {
1760             mut buf2 := strings.new_builder(re.cc.len + 128)
1761 
1762             // print all the instructions
1763 
1764             // end of the input text
1765             if state.i >= in_txt_len {
1766                 buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n')
1767                 sss := buf2.str()
1768                 re.log_func(sss)
1769             } else {
1770                 // print only the exe instruction
1771                 if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 {
1772                     if ist == regex.ist_prog_end {
1773                         buf2.write_string('# ${step_count:3d} PROG_END\n')
1774                     } else if ist == 0 || m_state in [.start, .ist_next, .stop] {
1775                         buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n')
1776                     } else {
1777                         ch, char_len = re.get_charb(in_txt, state.i)
1778 
1779                         buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>')
1780                         buf2.write_string('${ist:8x}'.replace(' ', '0'))
1781                         buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
1782 
1783                         if ist == regex.ist_simple_char {
1784                             buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
1785                         } else {
1786                             if ist == regex.ist_bsls_char {
1787                                 buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
1788                             } else if ist == regex.ist_prog_end {
1789                                 buf2.write_string('PROG_END')
1790                             } else if ist == regex.ist_or_branch {
1791                                 buf2.write_string('OR')
1792                             } else if ist == regex.ist_char_class_pos {
1793                                 buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]')
1794                             } else if ist == regex.ist_char_class_neg {
1795                                 buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]')
1796                             } else if ist == regex.ist_dot_char {
1797                                 buf2.write_string('DOT_CHAR')
1798                             } else if ist == regex.ist_group_start {
1799                                 tmp_gi := re.prog[state.pc].group_id
1800                                 tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
1801                                 buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ')
1802                             } else if ist == regex.ist_group_end {
1803                                 buf2.write_string('GROUP_END   #:${re.prog[state.pc].group_id} deep:${state.group_index}')
1804                             }
1805                         }
1806                         if re.prog[state.pc].rep_max == regex.max_quantifier {
1807                             buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}')
1808                         } else {
1809                             buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}')
1810                         }
1811                         if re.prog[state.pc].greedy == true {
1812                             buf2.write_string('?')
1813                         }
1814                         buf2.write_string(' (#${state.group_index})')
1815 
1816                         if ist == regex.ist_dot_char {
1817                             buf2.write_string(' last!')
1818                         }
1819 
1820                         buf2.write_string('\n')
1821                     }
1822                     sss2 := buf2.str()
1823                     re.log_func(sss2)
1824                 }
1825             }
1826             step_count++
1827             dbg_line++
1828         }
1829         //******************************************
1830 
1831         if ist == regex.ist_prog_end {
1832             // println("HERE we end!")
1833             break
1834         }
1835 
1836         // we're out of text, manage it
1837         if state.i >= in_txt_len || m_state == .new_line {
1838             // println("Finished text!!")
1839             src_end = true
1840 
1841             // we have fished the text, we must manage out pf bound indexes
1842             if state.i >= in_txt_len {
1843                 state.i = in_txt_len - 1
1844             }
1845 
1846             // manage groups
1847             if state.group_index >= 0 && state.match_index >= 0 {
1848                 // println("End text with open groups!")
1849                 // println("state.group_index: ${state.group_index}")
1850                 // close the groups
1851                 for state.group_index >= 0 {
1852                     tmp_pc := re.group_data[state.group_index]
1853                     re.prog[tmp_pc].group_rep++
1854                     // println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
1855 
1856                     if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min
1857                         && re.prog[tmp_pc].group_id >= 0 {
1858                         start_i := re.group_stack[state.group_index]
1859                         re.group_stack[state.group_index] = -1
1860 
1861                         // save group results
1862                         g_index := re.prog[tmp_pc].group_id * 2
1863                         // println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
1864                         if start_i >= 0 {
1865                             re.groups[g_index] = start_i
1866                         } else {
1867                             re.groups[g_index] = 0
1868                         }
1869 
1870                         re.groups[g_index + 1] = state.i
1871 
1872                         if re.groups[g_index + 1] >= in_txt_len {
1873                             // println("clamp group on stop!")
1874                             re.groups[g_index + 1] = in_txt_len - 1
1875                         }
1876 
1877                         // continuous save, save until we have space
1878                         re.group_continuous_save(g_index)
1879                     }
1880                     state.group_index--
1881                 }
1882             }
1883 
1884             // println("re.groups: ${re.groups}")
1885 
1886             // the text is finished and the groups closed and we are the last group, ok exit
1887             if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
1888                 // println("Last group end")
1889                 return state.first_match, state.i
1890             }
1891 
1892             if state.pc == -1 {
1893                 state.pc = last_fnd_pc
1894             }
1895 
1896             // println("Finished text!!")
1897             // println("Instruction: ${ist:08x} pc: $state.pc")
1898             // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}")
1899 
1900             // program end
1901             if ist == regex.ist_prog_end {
1902                 // println("Program end on end of text!")
1903                 return state.first_match, state.i
1904             }
1905 
1906             if l_ist in [
1907                 rune(regex.ist_char_class_neg),
1908                 regex.ist_char_class_pos,
1909                 regex.ist_bsls_char,
1910                 regex.ist_dot_char,
1911             ] {
1912                 // println("***** We have a last special token")
1913                 // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
1914                 // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
1915                 // println("first match: ${state.first_match}")
1916 
1917                 if re.prog[state.pc].last_dot_flag == true
1918                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min
1919                     && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
1920                     return state.first_match, state.i
1921                 }
1922                 // println("Not fitted!!")
1923             }
1924             // no groups open, check the last token quantifier
1925             if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
1926                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
1927                     && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
1928                     // println("We are in good repetition")
1929                     return state.first_match, state.i
1930                 }
1931             }
1932 
1933             // println("No good exit!!")
1934             if re.prog[re.prog_len - 1].ist == regex.ist_group_end {
1935                 // println("last ist is a group end!")
1936                 if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min {
1937                     return state.first_match, state.i
1938                 }
1939             }
1940             return regex.no_match_found, state.i
1941         }
1942 
1943         // starting and init
1944         if m_state == .start {
1945             state.pc = -1
1946             state.i = 0
1947             m_state = .ist_next
1948             continue
1949         }
1950         // ist_next, next instruction reseting its state
1951         else if m_state == .ist_next {
1952             state.pc = state.pc + 1
1953             re.prog[state.pc].reset()
1954             // check if we are in the program bounds
1955             if state.pc < 0 || state.pc > re.prog.len {
1956                 // println("ERROR!! PC overflow!!")
1957                 return regex.err_internal_error, state.i
1958             }
1959             m_state = .ist_load
1960             continue
1961         }
1962         // ist_next_ks, next instruction keeping its state
1963         else if m_state == .ist_next_ks {
1964             state.pc = state.pc + 1
1965             // check if we are in the program bounds
1966             if state.pc < 0 || state.pc > re.prog.len {
1967                 // println("ERROR!! PC overflow!!")
1968                 return regex.err_internal_error, state.i
1969             }
1970             m_state = .ist_load
1971             continue
1972         }
1973 
1974         // load the char
1975         ch, char_len = re.get_charb(in_txt, state.i)
1976 
1977         // check new line if flag f_nl enabled
1978         if (re.flag & regex.f_nl) != 0 && char_len == 1 && u8(ch) in regex.new_line_list {
1979             m_state = .new_line
1980             continue
1981         }
1982         // check if stop
1983         else if m_state == .stop {
1984             // we are in search mode, don't exit until the end
1985             if (re.flag & regex.f_src) != 0 && ist != regex.ist_prog_end {
1986                 last_fnd_pc = state.pc
1987                 state.pc = -1
1988                 state.i += char_len
1989 
1990                 m_state = .ist_next
1991                 re.reset_src()
1992                 state.match_index = -1
1993                 state.first_match = -1
1994 
1995                 // reset state list
1996                 re.reset()
1997 
1998                 continue
1999             }
2000 
2001             if ist == regex.ist_prog_end {
2002                 return state.first_match, state.i
2003             }
2004 
2005             // manage here dot char
2006 
2007             if re.state_list.len > 0 {
2008                 // println("Here we are, with stop: state buffer: [${re.state_list.len}]")
2009                 state = re.state_list.pop()
2010 
2011                 state.match_flag = true
2012                 l_ist = u32(regex.ist_dot_char)
2013 
2014                 if state.first_match < 0 {
2015                     state.first_match = state.i
2016                 }
2017                 state.match_index = state.i
2018                 re.prog[state.pc].rep++ // increase repetitions
2019 
2020                 state.i += char_len
2021                 m_state = .ist_quant_p
2022                 continue
2023             }
2024 
2025             // exit on no match
2026             return result, state.i
2027         }
2028         // ist_load
2029         else if m_state == .ist_load {
2030             // program end
2031             if ist == regex.ist_prog_end {
2032                 // if we are in match exit well
2033 
2034                 if state.group_index >= 0 && state.match_index >= 0 {
2035                     state.group_index = -1
2036                 }
2037 
2038                 m_state = .stop
2039                 continue
2040             }
2041             // check GROUP start, no quantifier is checkd for this token!!
2042             else if ist == regex.ist_group_start {
2043                 state.group_index++
2044                 re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
2045                 re.group_stack[state.group_index] = state.i // index where we start to manage
2046                 // println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
2047 
2048                 m_state = .ist_next
2049                 continue
2050             }
2051             // check GROUP end
2052             else if ist == regex.ist_group_end {
2053                 // we are in matching streak
2054                 // println("Group END!! last ist: ${l_ist:08x}")
2055                 if state.match_index >= 0 {
2056                     // restore txt index stack and save the group data
2057 
2058                     // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
2059                     if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
2060                         start_i := re.group_stack[state.group_index]
2061 
2062                         // save group results
2063                         g_index := re.prog[state.pc].group_id * 2
2064 
2065                         if start_i >= 0 {
2066                             re.groups[g_index] = start_i
2067                         } else {
2068                             re.groups[g_index] = 0
2069                         }
2070 
2071                         re.groups[g_index + 1] = state.i
2072 
2073                         if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] {
2074                             re.groups[g_index] = re.groups[g_index - 1]
2075                         }
2076 
2077                         if re.groups[g_index + 1] >= in_txt_len {
2078                             // println("clamp group!")
2079                             re.groups[g_index + 1] = in_txt_len - 1
2080                         }
2081 
2082                         // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len")
2083 
2084                         // continuous save, save until we have space
2085                         re.group_continuous_save(g_index)
2086                     }
2087 
2088                     re.prog[state.pc].group_rep++ // increase repetitions
2089                     // println("GROUP $group_index END ${re.prog[state.pc].group_rep}")
2090                     if re.prog[state.pc].group_rep > in_txt_len - 1 {
2091                         m_state = .ist_quant_ng
2092                         continue
2093                     }
2094 
2095                     m_state = .ist_quant_pg
2096                     continue
2097                 }
2098 
2099                 m_state = .ist_quant_ng
2100                 continue
2101             }
2102             // check OR
2103             else if ist == regex.ist_or_branch {
2104                 if state.match_index >= 0 {
2105                     state.pc = re.prog[state.pc].rep_max
2106                     // println("ist_or_branch True pc: $state.pc")
2107                 } else {
2108                     state.pc = re.prog[state.pc].rep_min
2109                     // println("ist_or_branch False pc: $state.pc")
2110                 }
2111                 re.prog[state.pc].reset()
2112                 m_state = .ist_load
2113                 continue
2114             }
2115             // check ist_dot_char
2116             else if ist == regex.ist_dot_char {
2117                 // println("ist_dot_char rep: ${re.prog[state.pc].rep}")
2118 
2119                 // check next token to be false
2120                 mut next_check_flag := false
2121 
2122                 // if we are done with max go on dot char are dedicated case!!
2123                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2124                     re.state_list.pop()
2125                     m_state = .ist_next
2126                     continue
2127                 }
2128 
2129                 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0
2130                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2131                     // load the char
2132                     // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2133                     ch_t := ch
2134                     chk_pc := re.prog[state.pc].dot_check_pc
2135 
2136                     // simple char
2137                     if re.prog[chk_pc].ist == regex.ist_simple_char {
2138                         if re.prog[chk_pc].ch == ch_t {
2139                             next_check_flag = true
2140                         }
2141                         // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2142                     }
2143                     // char char_class
2144                     else if re.prog[chk_pc].ist == regex.ist_char_class_pos
2145                         || re.prog[chk_pc].ist == regex.ist_char_class_neg {
2146                         mut cc_neg := false
2147                         if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2148                             cc_neg = true
2149                         }
2150                         mut cc_res := re.check_char_class(chk_pc, ch_t)
2151 
2152                         if cc_neg {
2153                             cc_res = !cc_res
2154                         }
2155                         next_check_flag = cc_res
2156                         // println("Check [ist_char_class] => $next_check_flag")
2157                     }
2158                     // check bsls
2159                     else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2160                         next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
2161                         // println("Check [ist_bsls_char] => $next_check_flag")
2162                     }
2163                 }
2164 
2165                 // check if we must continue or pass to the next IST
2166                 if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
2167                     // println("save the state!!")
2168                     mut dot_state := StateObj{
2169                         group_index: state.group_index
2170                         match_flag: state.match_flag
2171                         match_index: state.match_index
2172                         first_match: state.first_match
2173                         pc: state.pc
2174                         i: state.i + char_len
2175                         char_len: char_len
2176                         last_dot_pc: state.pc
2177                     }
2178                     // if we are mananging a .* stay on the same char on return
2179                     if re.prog[state.pc].rep_min == 0 {
2180                         dot_state.i -= char_len
2181                     }
2182 
2183                     re.state_list << dot_state
2184 
2185                     m_state = .ist_quant_n
2186                     // println("dot_char stack len: ${re.state_list.len}")
2187                     continue
2188                 }
2189 
2190                 state.match_flag = true
2191                 l_ist = u32(regex.ist_dot_char)
2192 
2193                 if state.first_match < 0 {
2194                     state.first_match = state.i
2195                 }
2196                 state.match_index = state.i
2197                 re.prog[state.pc].rep++ // increase repetitions
2198 
2199                 state.i += char_len
2200                 m_state = .ist_quant_p
2201                 continue
2202             }
2203             // char class IST
2204             else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg {
2205                 // check next token to be false
2206                 mut next_check_flag := false
2207 
2208                 // if we are done with max go on dot char are dedicated case!!
2209                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2210                     re.state_list.pop()
2211                     m_state = .ist_next
2212                     continue
2213                 }
2214 
2215                 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].cc_check_pc >= 0
2216                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2217                     // load the char
2218                     // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2219                     ch_t := ch
2220                     chk_pc := re.prog[state.pc].cc_check_pc
2221 
2222                     // simple char
2223                     if re.prog[chk_pc].ist == regex.ist_simple_char {
2224                         if re.prog[chk_pc].ch == ch_t {
2225                             next_check_flag = true
2226                         }
2227                         // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2228                     }
2229                     // char char_class
2230                     else if re.prog[chk_pc].ist == regex.ist_char_class_pos
2231                         || re.prog[chk_pc].ist == regex.ist_char_class_neg {
2232                         mut cc_neg := false
2233                         if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2234                             cc_neg = true
2235                         }
2236                         mut cc_res := re.check_char_class(chk_pc, ch_t)
2237 
2238                         if cc_neg {
2239                             cc_res = !cc_res
2240                         }
2241                         next_check_flag = cc_res
2242                         // println("Check [ist_char_class] => $next_check_flag")
2243                     }
2244                     // check bsls
2245                     else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2246                         next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
2247                         // println("Check [ist_bsls_char] => $next_check_flag")
2248                     }
2249                 }
2250 
2251                 // check if we must continue or pass to the next IST
2252                 if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
2253                     // println("save the state!!")
2254                     mut dot_state := StateObj{
2255                         group_index: state.group_index
2256                         match_flag: state.match_flag
2257                         match_index: state.match_index
2258                         first_match: state.first_match
2259                         pc: state.pc
2260                         i: state.i + char_len
2261                         char_len: char_len
2262                         last_dot_pc: state.pc
2263                     }
2264                     // if we are managing a \[something]* stay on the same char on return
2265                     if re.prog[state.pc].rep_min == 0 {
2266                         dot_state.i -= char_len
2267                     }
2268 
2269                     re.state_list << dot_state
2270 
2271                     m_state = .ist_quant_n
2272                     // println("dot_char stack len: ${re.state_list.len}")
2273                     continue
2274                 }
2275 
2276                 state.match_flag = false
2277                 mut cc_neg := false
2278 
2279                 if ist == regex.ist_char_class_neg {
2280                     cc_neg = true
2281                 }
2282                 mut cc_res := re.check_char_class(state.pc, ch)
2283 
2284                 if cc_neg {
2285                     cc_res = !cc_res
2286                 }
2287 
2288                 if cc_res {
2289                     state.match_flag = true
2290                     l_ist = u32(regex.ist_char_class_pos)
2291 
2292                     if state.first_match < 0 {
2293                         state.first_match = state.i
2294                     }
2295 
2296                     state.match_index = state.i
2297 
2298                     re.prog[state.pc].rep++ // increase repetitions
2299                     state.i += char_len // next char
2300                     m_state = .ist_quant_p
2301                     continue
2302                 }
2303                 m_state = .ist_quant_n
2304                 continue
2305             }
2306             // check bsls
2307             else if ist == regex.ist_bsls_char {
2308                 // println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
2309 
2310                 // check next token to be false
2311                 mut next_check_flag := false
2312 
2313                 // if we are done with max go on dot char are dedicated case!!
2314                 if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2315                     re.state_list.pop()
2316                     m_state = .ist_next
2317                     continue
2318                 }
2319 
2320                 if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0
2321                     && re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2322                     // load the char
2323                     // ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2324                     ch_t := ch
2325                     chk_pc := re.prog[state.pc].bsls_check_pc
2326 
2327                     // simple char
2328                     if re.prog[chk_pc].ist == regex.ist_simple_char {
2329                         if re.prog[chk_pc].ch == ch_t {
2330                             next_check_flag = true
2331                         }
2332                         // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2333                     }
2334                     // char char_class
2335                     else if re.prog[chk_pc].ist == regex.ist_char_class_pos
2336                         || re.prog[chk_pc].ist == regex.ist_char_class_neg {
2337                         mut cc_neg := false
2338                         if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2339                             cc_neg = true
2340                         }
2341                         mut cc_res := re.check_char_class(chk_pc, ch_t)
2342 
2343                         if cc_neg {
2344                             cc_res = !cc_res
2345                         }
2346                         next_check_flag = cc_res
2347                         // println("Check [ist_char_class] => $next_check_flag")
2348                     }
2349                     // check bsls
2350                     else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2351                         next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
2352                         // println("Check [ist_bsls_char] => $next_check_flag")
2353                     }
2354                 }
2355 
2356                 // check if we must continue or pass to the next IST
2357                 if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
2358                     // println("save the state!!")
2359                     mut dot_state := StateObj{
2360                         group_index: state.group_index
2361                         match_flag: state.match_flag
2362                         match_index: state.match_index
2363                         first_match: state.first_match
2364                         pc: state.pc
2365                         i: state.i + char_len
2366                         char_len: char_len
2367                         last_dot_pc: state.pc
2368                     }
2369                     // if we are managing a \[something]* stay on the same char on return
2370                     if re.prog[state.pc].rep_min == 0 {
2371                         dot_state.i -= char_len
2372                     }
2373 
2374                     re.state_list << dot_state
2375 
2376                     m_state = .ist_quant_n
2377                     // println("dot_char stack len: ${re.state_list.len}")
2378                     continue
2379                 }
2380 
2381                 tmp_res := re.prog[state.pc].validator(u8(ch))
2382                 if tmp_res == false {
2383                     m_state = .ist_quant_n
2384                     continue
2385                 }
2386                 // println("${ch} => ${tmp_res}")
2387 
2388                 state.match_flag = true
2389                 l_ist = u32(regex.ist_dot_char)
2390 
2391                 if state.first_match < 0 {
2392                     state.first_match = state.i
2393                 }
2394                 state.match_index = state.i
2395                 re.prog[state.pc].rep++ // increase repetitions
2396 
2397                 state.i += char_len
2398                 m_state = .ist_quant_p
2399                 continue
2400             }
2401             // simple char IST
2402             else if ist == regex.ist_simple_char {
2403                 // println("ist_simple_char")
2404                 state.match_flag = false
2405 
2406                 if re.prog[state.pc].ch == ch {
2407                     state.match_flag = true
2408                     l_ist = regex.ist_simple_char
2409 
2410                     if state.first_match < 0 {
2411                         state.first_match = state.i
2412                     }
2413                     // println("state.match_index: ${state.match_index}")
2414                     state.match_index = state.i
2415 
2416                     re.prog[state.pc].rep++ // increase repetitions
2417                     state.i += char_len // next char
2418                     m_state = .ist_quant_p
2419                     continue
2420                 }
2421                 m_state = .ist_quant_n
2422                 continue
2423             }
2424             // UNREACHABLE
2425             // println("PANIC2!! state: $m_state")
2426             return regex.err_internal_error, state.i
2427         }
2428         /***********************************
2429         * Quantifier management
2430         ***********************************/
2431         // ist_quant_ng => quantifier negative test on group
2432         else if m_state == .ist_quant_ng {
2433             // we are finished here
2434             if state.group_index < 0 {
2435                 // println("Early stop!")
2436                 result = regex.no_match_found
2437                 m_state = .stop
2438                 continue
2439             }
2440 
2441             tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
2442             rep := re.prog[tmp_pc].group_rep // use a temp variable
2443             re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2444 
2445             // println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep")
2446 
2447             if rep >= re.prog[tmp_pc].rep_min {
2448                 // println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
2449 
2450                 state.i = re.group_stack[state.group_index]
2451                 state.pc = tmp_pc
2452                 state.group_index--
2453                 m_state = .ist_next
2454                 continue
2455             } else if re.prog[tmp_pc].next_is_or {
2456                 // println("ist_quant_ng OR Negative branch")
2457 
2458                 state.i = re.group_stack[state.group_index]
2459                 state.pc = re.prog[tmp_pc + 1].rep_min - 1
2460                 state.group_index--
2461                 m_state = .ist_next
2462                 continue
2463             } else if rep > 0 && rep < re.prog[tmp_pc].rep_min {
2464                 // println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index")
2465 
2466                 // check if we are inside a group, if yes exit from the nested groups
2467                 if state.group_index > 0 {
2468                     state.group_index--
2469                     state.pc = tmp_pc
2470                     m_state = .ist_quant_ng //.ist_next
2471                     continue
2472                 }
2473 
2474                 if state.group_index == 0 {
2475                     state.group_index--
2476                     state.pc = tmp_pc // TEST
2477                     m_state = .ist_next
2478                     continue
2479                 }
2480 
2481                 result = regex.no_match_found
2482                 m_state = .stop
2483                 continue
2484             } else if rep == 0 && rep < re.prog[tmp_pc].rep_min {
2485                 // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index")
2486 
2487                 if state.group_index > 0 {
2488                     state.group_index--
2489                     state.pc = tmp_pc
2490                     m_state = .ist_quant_ng //.ist_next
2491                     continue
2492                 }
2493 
2494                 result = regex.no_match_found
2495                 m_state = .stop
2496                 continue
2497             }
2498 
2499             // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep")
2500             // UNREACHABLE
2501             return regex.err_internal_error, state.i
2502         }
2503         // ist_quant_pg => quantifier positive test on group
2504         else if m_state == .ist_quant_pg {
2505             // println(".ist_quant_pg")
2506             mut tmp_pc := state.pc
2507             if state.group_index >= 0 {
2508                 tmp_pc = re.group_data[state.group_index]
2509             }
2510 
2511             if re.prog[tmp_pc].group_neg == true {
2512                 // println("***** Negation of the group")
2513                 result = regex.no_match_found
2514                 m_state = .stop
2515                 continue
2516             }
2517 
2518             rep := re.prog[tmp_pc].group_rep
2519 
2520             if rep < re.prog[tmp_pc].rep_min {
2521                 // println("ist_quant_pg UNDER RANGE")
2522                 state.pc = re.prog[tmp_pc].goto_pc
2523                 m_state = .ist_next
2524                 continue
2525             } else if rep == re.prog[tmp_pc].rep_max {
2526                 // println("ist_quant_pg MAX RANGE")
2527                 re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2528                 state.group_index--
2529                 m_state = .ist_next
2530 
2531                 continue
2532             } else if rep >= re.prog[tmp_pc].rep_min {
2533                 // println("ist_quant_pg IN RANGE group_index:$state.group_index")
2534 
2535                 // check greedy flag, if true exit on minimum
2536                 if re.prog[tmp_pc].greedy == true {
2537                     re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2538                     state.group_index--
2539                     m_state = .ist_next
2540                     continue
2541                 }
2542 
2543                 state.pc = re.prog[tmp_pc].goto_pc - 1
2544                 state.group_index--
2545                 m_state = .ist_next
2546                 continue
2547             }
2548 
2549             // UNREACHABLE
2550             // println("PANIC3!! state: $m_state")
2551             return regex.err_internal_error, state.i
2552         }
2553         // ist_quant_n => quantifier negative test on token
2554         else if m_state == .ist_quant_n {
2555             rep := re.prog[state.pc].rep
2556             // println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}")
2557 
2558             // zero quantifier * or ?
2559             if rep == 0 && re.prog[state.pc].rep_min == 0 {
2560                 // println("ist_quant_n c_zero RANGE MIN")
2561                 m_state = .ist_next // go to next ist
2562                 continue
2563             }
2564             // match + or *
2565             else if rep >= re.prog[state.pc].rep_min {
2566                 // println("ist_quant_n MATCH RANGE")
2567                 m_state = .ist_next
2568                 continue
2569             }
2570 
2571             // check the OR if present
2572             if re.prog[state.pc].next_is_or {
2573                 // println("OR present on failing")
2574                 state.match_index = -1
2575                 m_state = .ist_next
2576                 continue
2577             }
2578 
2579             // we are in a group manage no match from here
2580             if state.group_index >= 0 {
2581                 // println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index")
2582                 m_state = .ist_quant_ng
2583                 continue
2584             }
2585 
2586             // no other options
2587             // println("ist_quant_n no_match_found")
2588             result = regex.no_match_found
2589             m_state = .stop
2590 
2591             // stop already started matching outside a capturing group
2592             if re.state_list.len > 0 && re.state_list.last().group_index == -1
2593                 && re.state_list.last().last_dot_pc > 0 {
2594                 if ist == regex.ist_dot_char || ist == regex.ist_bsls_char {
2595                     return regex.no_match_found, 0
2596                 }
2597             }
2598             continue
2599         }
2600         // ist_quant_p => quantifier positive test on token
2601         else if m_state == .ist_quant_p {
2602             // println("Here .ist_quant_p")
2603             // exit on first match
2604             if (re.flag & regex.f_efm) != 0 {
2605                 return state.i, state.i + 1
2606             }
2607 
2608             rep := re.prog[state.pc].rep
2609             // println(rep)
2610 
2611             // under range
2612             if rep > 0 && rep < re.prog[state.pc].rep_min {
2613                 // println("ist_quant_p UNDER RANGE")
2614                 m_state = .ist_load // continue the loop
2615                 continue
2616             }
2617             // range ok, continue loop
2618             else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max {
2619                 // println("ist_quant_p IN RANGE")
2620 
2621                 // check greedy flag, if true exit on minimum
2622                 if re.prog[state.pc].greedy == true {
2623                     m_state = .ist_next
2624                     continue
2625                 }
2626                 m_state = .ist_load
2627                 continue
2628             }
2629             // max reached
2630             else if rep == re.prog[state.pc].rep_max {
2631                 // println("ist_quant_p MAX RANGE")
2632                 m_state = .ist_next
2633                 continue
2634             }
2635         }
2636         // UNREACHABLE
2637         // println("PANIC4!! state: $m_state")
2638         return regex.err_internal_error, state.i
2639     }
2640 
2641     // println("Check end of text!")
2642     // Check the results
2643     if state.match_index >= 0 {
2644         if state.group_index < 0 {
2645             if re.prog[state.pc].ist == regex.ist_prog_end {
2646                 // println("program ended!!")
2647 
2648                 if (re.flag & regex.f_src) != 0 {
2649                     // println("find return")
2650                     return state.first_match, state.i
2651                 } else {
2652                     // println("Here!!")
2653                     return 0, state.i
2654                 }
2655             }
2656 
2657             // println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
2658 
2659             if re.prog[state.pc + 1].ist == regex.ist_prog_end
2660                 || re.prog[state.pc].ist == regex.ist_prog_end {
2661                 rep := re.prog[state.pc].rep
2662                 // println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}")
2663                 if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max {
2664                     return state.first_match, state.i
2665                 }
2666                 // println("Program not finished! ")
2667                 return regex.no_match_found, state.i
2668             }
2669             if src_end {
2670                 // println("program end")
2671                 return state.first_match, state.i
2672             }
2673             // print("No match found!!")
2674             return regex.no_match_found, state.i
2675         } else {
2676             // println("Group match! OK")
2677             // println("first_match: $state.first_match, i: $state.i")
2678 
2679             // println("Skip last group")
2680             return state.first_match, state.i
2681             // return state.first_match,re.group_stack[state.group_index--]
2682         }
2683     }
2684     // println("no_match_found, natural end")
2685     return regex.no_match_found, state.i
2686 }