1 | /* |
2 | regex 1.0 alpha |
3 | |
4 | Copyright (c) 2019-2023 Dario Deledda. All rights reserved. |
5 | Use of this source code is governed by an MIT license |
6 | that can be found in the LICENSE file. |
7 | |
8 | This file contains regex module |
9 | |
10 | Know limitation: |
11 | - find is implemented in a trivial way |
12 | - not full compliant PCRE |
13 | - not compliant POSIX ERE |
14 | */ |
15 | module regex |
16 | |
17 | import strings |
18 | |
19 | pub const ( |
20 | v_regex_version = '1.0 alpha' // regex module version |
21 | |
22 | max_code_len = 256 // default small base code len for the regex programs |
23 | max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30 |
24 | // spaces chars (here only westerns!!) TODO: manage all the spaces from unicode |
25 | spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`] |
26 | // new line chars for now only '\n' |
27 | new_line_list = [`\n`, `\r`] |
28 | |
29 | // Results |
30 | no_match_found = -1 |
31 | |
32 | // Errors |
33 | compile_ok = 0 // the regex string compiled, all ok |
34 | err_char_unknown = -2 // the char used is unknow to the system |
35 | err_undefined = -3 // the compiler symbol is undefined |
36 | err_internal_error = -4 // Bug in the regex system!! |
37 | err_cc_alloc_overflow = -5 // memory for char class full!! |
38 | err_syntax_error = -6 // syntax error in regex compiling |
39 | err_groups_overflow = -7 // max number of groups reached |
40 | err_groups_max_nested = -8 // max number of nested group reached |
41 | err_group_not_balanced = -9 // group not balanced |
42 | err_group_qm_notation = -10 // group invalid notation |
43 | err_invalid_or_with_cc = -11 // invalid or on two consecutive char class |
44 | err_neg_group_quantifier = -12 // negation groups can not have quantifier |
45 | err_consecutive_dots = -13 // two consecutive dots is an error |
46 | ) |
47 | |
48 | const ( |
49 | //************************************* |
50 | // regex program instructions |
51 | //************************************* |
52 | ist_simple_char = u32(0x7FFFFFFF) // single char instruction, 31 bit available to char |
53 | // char class 11 0100 AA xxxxxxxx |
54 | // AA = 00 regular class |
55 | // AA = 01 Negated class ^ char |
56 | ist_char_class = u32(0xD1000000) // MASK |
57 | ist_char_class_pos = u32(0xD0000000) // char class normal [abc] |
58 | ist_char_class_neg = u32(0xD1000000) // char class negate [^abc] |
59 | // dot char 10 0110 xx xxxxxxxx |
60 | ist_dot_char = u32(0x98000000) // match any char except \n |
61 | // backslash chars 10 0100 xx xxxxxxxx |
62 | ist_bsls_char = u32(0x90000000) // backslash char |
63 | // OR | 10 010Y xx xxxxxxxx |
64 | ist_or_branch = u32(0x91000000) // OR case |
65 | // groups 10 010Y xx xxxxxxxx |
66 | ist_group_start = u32(0x92000000) // group start ( |
67 | ist_group_end = u32(0x94000000) // group end ) |
68 | // control instructions |
69 | ist_prog_end = u32(0x88000000) // 10 0010 xx xxxxxxxx |
70 | //************************************* |
71 | ) |
72 | |
73 | /* |
74 | General Utilities |
75 | */ |
76 | // utf8util_char_len calculate the length in bytes of a utf8 char |
77 | [inline] |
78 | fn utf8util_char_len(b u8) int { |
79 | return ((0xe5000000 >> ((b >> 3) & 0x1e)) & 3) + 1 |
80 | } |
81 | |
82 | // get_char get a char from position i and return an u32 with the unicode code |
83 | [direct_array_access; inline] |
84 | fn (re RE) get_char(in_txt string, i int) (u32, int) { |
85 | ini := unsafe { in_txt.str[i] } |
86 | // ascii 8 bit |
87 | if (re.flag & regex.f_bin) != 0 || ini & 0x80 == 0 { |
88 | return u32(ini), 1 |
89 | } |
90 | // unicode char |
91 | char_len := utf8util_char_len(ini) |
92 | mut tmp := 0 |
93 | mut ch := u32(0) |
94 | for tmp < char_len { |
95 | ch = (ch << 8) | unsafe { in_txt.str[i + tmp] } |
96 | tmp++ |
97 | } |
98 | return ch, char_len |
99 | } |
100 | |
101 | // get_charb get a char from position i and return an u32 with the unicode code |
102 | [direct_array_access; inline] |
103 | fn (re RE) get_charb(in_txt &u8, i int) (u32, int) { |
104 | // ascii 8 bit |
105 | if (re.flag & regex.f_bin) != 0 || unsafe { in_txt[i] } & 0x80 == 0 { |
106 | return u32(unsafe { in_txt[i] }), 1 |
107 | } |
108 | // unicode char |
109 | char_len := utf8util_char_len(unsafe { in_txt[i] }) |
110 | mut tmp := 0 |
111 | mut ch := u32(0) |
112 | for tmp < char_len { |
113 | ch = (ch << 8) | unsafe { in_txt[i + tmp] } |
114 | tmp++ |
115 | } |
116 | return ch, char_len |
117 | } |
118 | |
119 | [inline] |
120 | fn is_alnum(in_char u8) bool { |
121 | mut tmp := in_char - `A` |
122 | if tmp <= 25 { |
123 | return true |
124 | } |
125 | tmp = in_char - `a` |
126 | if tmp <= 25 { |
127 | return true |
128 | } |
129 | tmp = in_char - `0` |
130 | if tmp <= 9 { |
131 | return true |
132 | } |
133 | if in_char == `_` { |
134 | return true |
135 | } |
136 | return false |
137 | } |
138 | |
139 | [inline] |
140 | fn is_not_alnum(in_char u8) bool { |
141 | return !is_alnum(in_char) |
142 | } |
143 | |
144 | [inline] |
145 | fn is_space(in_char u8) bool { |
146 | return in_char in regex.spaces |
147 | } |
148 | |
149 | [inline] |
150 | fn is_not_space(in_char u8) bool { |
151 | return !is_space(in_char) |
152 | } |
153 | |
154 | [inline] |
155 | fn is_digit(in_char u8) bool { |
156 | tmp := in_char - `0` |
157 | return tmp <= 0x09 |
158 | } |
159 | |
160 | [inline] |
161 | fn is_not_digit(in_char u8) bool { |
162 | return !is_digit(in_char) |
163 | } |
164 | |
165 | /* |
166 | [inline] |
167 | fn is_wordchar(in_char byte) bool { |
168 | return is_alnum(in_char) || in_char == `_` |
169 | } |
170 | |
171 | [inline] |
172 | fn is_not_wordchar(in_char byte) bool { |
173 | return !is_alnum(in_char) |
174 | } |
175 | */ |
176 | |
177 | [inline] |
178 | fn is_lower(in_char u8) bool { |
179 | tmp := in_char - `a` |
180 | return tmp <= 25 |
181 | } |
182 | |
183 | [inline] |
184 | fn is_upper(in_char u8) bool { |
185 | tmp := in_char - `A` |
186 | return tmp <= 25 |
187 | } |
188 | |
189 | pub fn (re RE) get_parse_error_string(err int) string { |
190 | match err { |
191 | regex.compile_ok { return 'compile_ok' } |
192 | regex.no_match_found { return 'no_match_found' } |
193 | regex.err_char_unknown { return 'err_char_unknown' } |
194 | regex.err_undefined { return 'err_undefined' } |
195 | regex.err_internal_error { return 'err_internal_error' } |
196 | regex.err_cc_alloc_overflow { return 'err_cc_alloc_overflow' } |
197 | regex.err_syntax_error { return 'err_syntax_error' } |
198 | regex.err_groups_overflow { return 'err_groups_overflow' } |
199 | regex.err_groups_max_nested { return 'err_groups_max_nested' } |
200 | regex.err_group_not_balanced { return 'err_group_not_balanced' } |
201 | regex.err_group_qm_notation { return 'err_group_qm_notation' } |
202 | regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' } |
203 | regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' } |
204 | regex.err_consecutive_dots { return 'err_consecutive_dots' } |
205 | else { return 'err_unknown' } |
206 | } |
207 | } |
208 | |
209 | // utf8_str convert and utf8 sequence to a printable string |
210 | [inline] |
211 | fn utf8_str(ch rune) string { |
212 | mut i := 4 |
213 | mut res := '' |
214 | for i > 0 { |
215 | v := u8((ch >> ((i - 1) * 8)) & 0xFF) |
216 | if v != 0 { |
217 | res += '${v:1c}' |
218 | } |
219 | i-- |
220 | } |
221 | return res |
222 | } |
223 | |
224 | // simple_log default log function |
225 | fn simple_log(txt string) { |
226 | print(txt) |
227 | } |
228 | |
229 | /****************************************************************************** |
230 | * |
231 | * Token Structs |
232 | * |
233 | ******************************************************************************/ |
234 | pub type FnValidator = fn (u8) bool |
235 | |
236 | struct Token { |
237 | mut: |
238 | ist rune |
239 | // char |
240 | ch rune // char of the token if any |
241 | ch_len u8 // char len |
242 | // Quantifiers / branch |
243 | rep_min int // used also for jump next in the OR branch [no match] pc jump |
244 | rep_max int // used also for jump next in the OR branch [ match] pc jump |
245 | greedy bool // greedy quantifier flag |
246 | // Char class |
247 | cc_index int = -1 |
248 | // counters for quantifier check (repetitions) |
249 | rep int |
250 | // validator function pointer |
251 | validator FnValidator = unsafe { nil } |
252 | // groups variables |
253 | group_neg bool // negation flag for the group, 0 => no negation > 0 => negataion |
254 | group_rep int // repetition of the group |
255 | group_id int = -1 // id of the group |
256 | goto_pc int = -1 // jump to this PC if is needed |
257 | // OR flag for the token |
258 | next_is_or bool // true if the next token is an OR |
259 | // dot_char token variables |
260 | dot_check_pc int = -1 // pc of the next token to check for dots |
261 | bsls_check_pc int = -1 // pc of the next token to check for bsls |
262 | cc_check_pc int = -1 // pc of the next token to check for CC |
263 | last_dot_flag bool // if true indicate that is the last dot_char in the regex |
264 | // debug fields |
265 | source_index int |
266 | } |
267 | |
268 | [inline] |
269 | fn (mut tok Token) reset() { |
270 | tok.rep = 0 |
271 | } |
272 | |
273 | /****************************************************************************** |
274 | * |
275 | * Regex struct |
276 | * |
277 | ******************************************************************************/ |
278 | pub const ( |
279 | f_nl = 0x00000001 // end the match when find a new line symbol |
280 | f_ms = 0x00000002 // match true only if the match is at the start of the string |
281 | f_me = 0x00000004 // match true only if the match is at the end of the string |
282 | |
283 | f_efm = 0x00000100 // exit on first token matched, used by search |
284 | f_bin = 0x00000200 // work only on bytes, ignore utf-8 |
285 | // behaviour modifier flags |
286 | f_src = 0x00020000 // search mode enabled |
287 | ) |
288 | |
289 | // Log function prototype |
290 | pub type FnLog = fn (string) |
291 | |
292 | pub struct RE { |
293 | pub mut: |
294 | prog []Token |
295 | prog_len int // regex program len |
296 | // char classes storage |
297 | cc []CharClass // char class list |
298 | cc_index int // index |
299 | // groups |
300 | group_count int // number of groups in this regex struct |
301 | groups []int // groups index results |
302 | group_max_nested int = 3 // max nested group |
303 | group_max int = 8 // max allowed number of different groups |
304 | |
305 | state_list []StateObj |
306 | |
307 | group_csave_flag bool // flag to enable continuous saving |
308 | group_csave []int //= []int{} // groups continuous save list |
309 | |
310 | group_map map[string]int // groups names map |
311 | |
312 | group_stack []int |
313 | group_data []int |
314 | // flags |
315 | flag int // flag for optional parameters |
316 | // Debug/log |
317 | debug int // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE |
318 | log_func FnLog = simple_log // log function, can be customized by the user |
319 | query string // query string |
320 | } |
321 | |
322 | // Reset RE object |
323 | [direct_array_access; inline] |
324 | pub fn (mut re RE) reset() { |
325 | re.cc_index = 0 |
326 | |
327 | mut i := 0 |
328 | for i < re.prog_len { |
329 | re.prog[i].group_rep = 0 // clear repetition of the group |
330 | re.prog[i].rep = 0 // clear repetition of the token |
331 | i++ |
332 | } |
333 | |
334 | // init groups array |
335 | if re.group_count > 0 { |
336 | if re.groups.len == 0 { |
337 | // first run alloc memory |
338 | re.groups = []int{len: re.group_count * 2, init: -1} |
339 | } else { |
340 | // subsequent executions, only clean up the memory |
341 | i = 0 |
342 | for i < re.groups.len { |
343 | re.groups[i] = -1 |
344 | i++ |
345 | } |
346 | } |
347 | } |
348 | |
349 | // reset group_csave |
350 | if re.group_csave_flag == true { |
351 | re.group_csave.clear() // = []int{} |
352 | } |
353 | |
354 | // reset state list |
355 | re.state_list.clear() |
356 | re.group_stack.clear() |
357 | } |
358 | |
359 | // reset for search mode fail |
360 | // gcc bug, dont use [inline] or go 5 time slower |
361 | //[inline] |
362 | [direct_array_access] |
363 | fn (mut re RE) reset_src() { |
364 | mut i := 0 |
365 | for i < re.prog_len { |
366 | re.prog[i].group_rep = 0 // clear repetition of the group |
367 | re.prog[i].rep = 0 // clear repetition of the token |
368 | i++ |
369 | } |
370 | } |
371 | |
372 | /****************************************************************************** |
373 | * |
374 | * Backslashes chars |
375 | * |
376 | ******************************************************************************/ |
377 | struct BslsStruct { |
378 | ch rune // meta char |
379 | validator FnValidator = unsafe { nil } // validator function pointer |
380 | } |
381 | |
382 | const ( |
383 | bsls_validator_array = [ |
384 | BslsStruct{`w`, is_alnum}, |
385 | BslsStruct{`W`, is_not_alnum}, |
386 | BslsStruct{`s`, is_space}, |
387 | BslsStruct{`S`, is_not_space}, |
388 | BslsStruct{`d`, is_digit}, |
389 | BslsStruct{`D`, is_not_digit}, |
390 | BslsStruct{`a`, is_lower}, |
391 | BslsStruct{`A`, is_upper}, |
392 | ] |
393 | |
394 | // these chars are escape if preceded by a \ |
395 | bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`, |
396 | `^`, `!`] |
397 | ) |
398 | |
399 | enum BSLS_parse_state { |
400 | start |
401 | bsls_found |
402 | bsls_char |
403 | normal_char |
404 | } |
405 | |
406 | // parse_bsls return (index, str_len) bsls_validator_array index, len of the backslash sequence if present |
407 | fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) { |
408 | mut status := BSLS_parse_state.start |
409 | mut i := in_i |
410 | |
411 | for i < in_txt.len { |
412 | // get our char |
413 | char_tmp, char_len := re.get_char(in_txt, i) |
414 | ch := u8(char_tmp) |
415 | |
416 | if status == .start && ch == `\\` { |
417 | status = .bsls_found |
418 | i += char_len |
419 | continue |
420 | } |
421 | |
422 | // check if is our bsls char, for now only one length sequence |
423 | if status == .bsls_found { |
424 | for c, x in regex.bsls_validator_array { |
425 | if x.ch == ch { |
426 | return c, i - in_i + 1 |
427 | } |
428 | } |
429 | status = .normal_char |
430 | continue |
431 | } |
432 | |
433 | // no BSLS validator, manage as normal escape char char |
434 | if status == .normal_char { |
435 | if ch in regex.bsls_escape_list { |
436 | return regex.no_match_found, i - in_i + 1 |
437 | } |
438 | return regex.err_syntax_error, i - in_i + 1 |
439 | } |
440 | |
441 | // at the present time we manage only one char after the \ |
442 | break |
443 | } |
444 | // not our bsls return KO |
445 | return regex.err_syntax_error, i |
446 | } |
447 | |
448 | /****************************************************************************** |
449 | * |
450 | * Char class |
451 | * |
452 | ******************************************************************************/ |
453 | const ( |
454 | cc_null = 0 // empty cc token |
455 | cc_char = 1 // simple char: a |
456 | cc_int = 2 // char interval: a-z |
457 | cc_bsls = 3 // backslash char |
458 | cc_end = 4 // cc sequence terminator |
459 | ) |
460 | |
461 | struct CharClass { |
462 | mut: |
463 | cc_type int // type of cc token |
464 | ch0 rune // first char of the interval a-b a in this case |
465 | ch1 rune // second char of the interval a-b b in this case |
466 | validator FnValidator = unsafe { nil } // validator function pointer |
467 | } |
468 | |
469 | enum CharClass_parse_state { |
470 | start |
471 | in_char |
472 | in_bsls |
473 | separator |
474 | finish |
475 | } |
476 | |
477 | fn (re RE) get_char_class(pc int) string { |
478 | buf := []u8{len: (re.cc.len)} |
479 | mut buf_ptr := unsafe { &u8(&buf) } |
480 | |
481 | mut cc_i := re.prog[pc].cc_index |
482 | mut i := 0 |
483 | mut tmp := 0 |
484 | for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end { |
485 | if re.cc[cc_i].cc_type == regex.cc_bsls { |
486 | unsafe { |
487 | buf_ptr[i] = `\\` |
488 | i++ |
489 | buf_ptr[i] = u8(re.cc[cc_i].ch0) |
490 | i++ |
491 | } |
492 | } else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 { |
493 | tmp = 3 |
494 | for tmp >= 0 { |
495 | x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) |
496 | if x != 0 { |
497 | unsafe { |
498 | buf_ptr[i] = x |
499 | i++ |
500 | } |
501 | } |
502 | tmp-- |
503 | } |
504 | } else { |
505 | tmp = 3 |
506 | for tmp >= 0 { |
507 | x := u8((re.cc[cc_i].ch0 >> (tmp * 8)) & 0xFF) |
508 | if x != 0 { |
509 | unsafe { |
510 | buf_ptr[i] = x |
511 | i++ |
512 | } |
513 | } |
514 | tmp-- |
515 | } |
516 | unsafe { |
517 | buf_ptr[i] = `-` |
518 | i++ |
519 | } |
520 | tmp = 3 |
521 | for tmp >= 0 { |
522 | x := u8((re.cc[cc_i].ch1 >> (tmp * 8)) & 0xFF) |
523 | if x != 0 { |
524 | unsafe { |
525 | buf_ptr[i] = x |
526 | i++ |
527 | } |
528 | } |
529 | tmp-- |
530 | } |
531 | } |
532 | cc_i++ |
533 | } |
534 | unsafe { |
535 | buf_ptr[i] = u8(0) |
536 | } |
537 | return unsafe { tos_clone(buf_ptr) } |
538 | } |
539 | |
540 | fn (re RE) check_char_class(pc int, ch rune) bool { |
541 | mut cc_i := re.prog[pc].cc_index |
542 | for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end { |
543 | if re.cc[cc_i].cc_type == regex.cc_bsls { |
544 | if re.cc[cc_i].validator(u8(ch)) { |
545 | return true |
546 | } |
547 | } else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 { |
548 | return true |
549 | } |
550 | cc_i++ |
551 | } |
552 | return false |
553 | } |
554 | |
555 | // parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char |
556 | fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) { |
557 | mut status := CharClass_parse_state.start |
558 | mut i := in_i |
559 | |
560 | mut tmp_index := re.cc_index |
561 | res_index := re.cc_index |
562 | |
563 | mut cc_type := u32(regex.ist_char_class_pos) |
564 | |
565 | for i < in_txt.len { |
566 | // check if we are out of memory for char classes |
567 | if tmp_index >= re.cc.len { |
568 | return regex.err_cc_alloc_overflow, 0, u32(0) |
569 | } |
570 | |
571 | // get our char |
572 | char_tmp, char_len := re.get_char(in_txt, i) |
573 | ch := u8(char_tmp) |
574 | |
575 | // println("CC #${i:3d} ch: ${ch:c}") |
576 | |
577 | // negation |
578 | if status == .start && ch == `^` { |
579 | cc_type = u32(regex.ist_char_class_neg) |
580 | i += char_len |
581 | continue |
582 | } |
583 | |
584 | // minus symbol |
585 | if status == .start && ch == `-` { |
586 | re.cc[tmp_index].cc_type = regex.cc_char |
587 | re.cc[tmp_index].ch0 = char_tmp |
588 | re.cc[tmp_index].ch1 = char_tmp |
589 | i += char_len |
590 | tmp_index++ |
591 | continue |
592 | } |
593 | |
594 | // bsls |
595 | if (status == .start || status == .in_char) && ch == `\\` { |
596 | // println("CC bsls.") |
597 | status = .in_bsls |
598 | i += char_len |
599 | continue |
600 | } |
601 | |
602 | if status == .in_bsls { |
603 | // println("CC bsls validation.") |
604 | for c, x in regex.bsls_validator_array { |
605 | if x.ch == ch { |
606 | // println("CC bsls found [${ch:c}]") |
607 | re.cc[tmp_index].cc_type = regex.cc_bsls |
608 | re.cc[tmp_index].ch0 = regex.bsls_validator_array[c].ch |
609 | re.cc[tmp_index].ch1 = regex.bsls_validator_array[c].ch |
610 | re.cc[tmp_index].validator = regex.bsls_validator_array[c].validator |
611 | i += char_len |
612 | tmp_index++ |
613 | status = .in_char |
614 | break |
615 | } |
616 | } |
617 | if status == .in_bsls { |
618 | // manage as a simple char |
619 | // println("CC bsls not found [${ch:c}]") |
620 | re.cc[tmp_index].cc_type = regex.cc_char |
621 | re.cc[tmp_index].ch0 = char_tmp |
622 | re.cc[tmp_index].ch1 = char_tmp |
623 | i += char_len |
624 | tmp_index++ |
625 | status = .in_char |
626 | continue |
627 | } else { |
628 | continue |
629 | } |
630 | } |
631 | |
632 | // simple char |
633 | if (status == .start || status == .in_char) && ch != `-` && ch != `]` { |
634 | status = .in_char |
635 | |
636 | re.cc[tmp_index].cc_type = regex.cc_char |
637 | re.cc[tmp_index].ch0 = char_tmp |
638 | re.cc[tmp_index].ch1 = char_tmp |
639 | |
640 | i += char_len |
641 | tmp_index++ |
642 | continue |
643 | } |
644 | |
645 | // check range separator |
646 | if status == .in_char && ch == `-` { |
647 | status = .separator |
648 | i += char_len |
649 | continue |
650 | } |
651 | |
652 | // check range end |
653 | if status == .separator && ch != `]` && ch != `-` { |
654 | status = .in_char |
655 | re.cc[tmp_index - 1].cc_type = regex.cc_int |
656 | re.cc[tmp_index - 1].ch1 = char_tmp |
657 | i += char_len |
658 | continue |
659 | } |
660 | |
661 | // char class end |
662 | if status == .in_char && ch == `]` { |
663 | re.cc[tmp_index].cc_type = regex.cc_end |
664 | re.cc[tmp_index].ch0 = 0 |
665 | re.cc[tmp_index].ch1 = 0 |
666 | re.cc_index = tmp_index + 1 |
667 | |
668 | return res_index, i - in_i + 2, cc_type |
669 | } |
670 | |
671 | i++ |
672 | } |
673 | return regex.err_syntax_error, 0, u32(0) |
674 | } |
675 | |
676 | /****************************************************************************** |
677 | * |
678 | * Re Compiler |
679 | * |
680 | ******************************************************************************/ |
681 | // |
682 | // Quantifier |
683 | // |
684 | enum Quant_parse_state { |
685 | start |
686 | min_parse |
687 | comma_checked |
688 | max_parse |
689 | greedy |
690 | gredy_parse |
691 | finish |
692 | } |
693 | |
694 | // parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char |
695 | fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { |
696 | mut status := Quant_parse_state.start |
697 | mut i := in_i |
698 | |
699 | mut q_min := 0 // default min in a {} quantifier is 1 |
700 | mut q_max := 0 // default max in a {} quantifier is max_quantifier |
701 | |
702 | mut ch := u8(0) |
703 | |
704 | for i < in_txt.len { |
705 | unsafe { |
706 | ch = in_txt.str[i] |
707 | } |
708 | // println("${ch:c} status: $status") |
709 | |
710 | // exit on no compatible char with {} quantifier |
711 | if utf8util_char_len(ch) != 1 { |
712 | return regex.err_syntax_error, i, 0, false |
713 | } |
714 | |
715 | // min parsing skip if comma present |
716 | if status == .start && ch == `,` { |
717 | q_min = 0 // default min in a {} quantifier is 0 |
718 | status = .comma_checked |
719 | i++ |
720 | continue |
721 | } |
722 | |
723 | if status == .start && is_digit(ch) { |
724 | status = .min_parse |
725 | q_min *= 10 |
726 | q_min += int(ch - `0`) |
727 | i++ |
728 | continue |
729 | } |
730 | |
731 | if status == .min_parse && is_digit(ch) { |
732 | q_min *= 10 |
733 | q_min += int(ch - `0`) |
734 | i++ |
735 | continue |
736 | } |
737 | |
738 | // we have parsed the min, now check the max |
739 | if status == .min_parse && ch == `,` { |
740 | status = .comma_checked |
741 | i++ |
742 | continue |
743 | } |
744 | |
745 | // single value {4} |
746 | if status == .min_parse && ch == `}` { |
747 | q_max = q_min |
748 | status = .greedy |
749 | continue |
750 | } |
751 | |
752 | // end without max |
753 | if status == .comma_checked && ch == `}` { |
754 | q_max = regex.max_quantifier |
755 | status = .greedy |
756 | continue |
757 | } |
758 | |
759 | // start max parsing |
760 | if status == .comma_checked && is_digit(ch) { |
761 | status = .max_parse |
762 | q_max *= 10 |
763 | q_max += int(ch - `0`) |
764 | i++ |
765 | continue |
766 | } |
767 | |
768 | // parse the max |
769 | if status == .max_parse && is_digit(ch) { |
770 | q_max *= 10 |
771 | q_max += int(ch - `0`) |
772 | i++ |
773 | continue |
774 | } |
775 | |
776 | // finished the quantifier |
777 | if status == .max_parse && ch == `}` { |
778 | status = .greedy |
779 | continue |
780 | } |
781 | |
782 | // check if greedy flag char ? is present |
783 | if status == .greedy { |
784 | if i + 1 < in_txt.len { |
785 | i++ |
786 | status = .gredy_parse |
787 | continue |
788 | } |
789 | return q_min, q_max, i - in_i + 2, false |
790 | } |
791 | |
792 | // check the greedy flag |
793 | if status == .gredy_parse { |
794 | if ch == `?` { |
795 | return q_min, q_max, i - in_i + 2, true |
796 | } else { |
797 | i-- |
798 | return q_min, q_max, i - in_i + 2, false |
799 | } |
800 | } |
801 | |
802 | // not a {} quantifier, exit |
803 | return regex.err_syntax_error, i, 0, false |
804 | } |
805 | |
806 | // not a conform {} quantifier |
807 | return regex.err_syntax_error, i, 0, false |
808 | } |
809 | |
810 | // |
811 | // Groups |
812 | // |
813 | enum Group_parse_state { |
814 | start |
815 | q_mark // (? |
816 | q_mark1 // (?:|P checking |
817 | p_status // (?P |
818 | p_start // (?P< |
819 | p_end // (?P<...> |
820 | p_in_name // (?P<... |
821 | finish |
822 | } |
823 | |
824 | // parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index) |
825 | fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) { |
826 | mut status := Group_parse_state.start |
827 | mut i := in_i |
828 | mut name := '' |
829 | |
830 | for i < in_txt.len && status != .finish { |
831 | // get our char |
832 | char_tmp, char_len := re.get_char(in_txt, i) |
833 | ch := u8(char_tmp) |
834 | |
835 | // start |
836 | if status == .start && ch == `(` { |
837 | status = .q_mark |
838 | i += char_len |
839 | continue |
840 | } |
841 | |
842 | // check for question marks |
843 | if status == .q_mark && ch == `?` { |
844 | status = .q_mark1 |
845 | i += char_len |
846 | continue |
847 | } |
848 | |
849 | // negate group |
850 | if status == .q_mark1 && ch == `!` { |
851 | i += char_len |
852 | return 0, false, true, name, i |
853 | } |
854 | |
855 | // non capturing group |
856 | if status == .q_mark1 && ch == `:` { |
857 | i += char_len |
858 | return 0, false, false, name, i |
859 | } |
860 | |
861 | // enter in P section |
862 | if status == .q_mark1 && ch == `P` { |
863 | status = .p_status |
864 | i += char_len |
865 | continue |
866 | } |
867 | |
868 | // not a valid q mark found |
869 | if status == .q_mark1 { |
870 | // println("NO VALID Q MARK") |
871 | return -2, true, false, name, i |
872 | } |
873 | |
874 | if status == .p_status && ch == `<` { |
875 | status = .p_start |
876 | i += char_len |
877 | continue |
878 | } |
879 | |
880 | if status == .p_start && ch != `>` { |
881 | status = .p_in_name |
882 | name += '${ch:1c}' // TODO: manage utf8 chars |
883 | i += char_len |
884 | continue |
885 | } |
886 | |
887 | // colect name |
888 | if status == .p_in_name && ch != `>` && is_alnum(ch) { |
889 | name += '${ch:1c}' // TODO: manage utf8 chars |
890 | i += char_len |
891 | continue |
892 | } |
893 | |
894 | // end name |
895 | if status == .p_in_name && ch == `>` { |
896 | i += char_len |
897 | return 0, true, false, name, i |
898 | } |
899 | |
900 | // error on name group |
901 | if status == .p_in_name { |
902 | return -2, true, false, name, i |
903 | } |
904 | |
905 | // normal group, nothig to do, exit |
906 | return 0, true, false, name, i |
907 | } |
908 | // UNREACHABLE |
909 | // println("ERROR!! NOT MEANT TO BE HERE!!1") |
910 | return -2, true, false, name, i |
911 | } |
912 | |
913 | const ( |
914 | quntifier_chars = [rune(`+`), `*`, `?`, `{`] |
915 | ) |
916 | |
917 | // |
918 | // main compiler |
919 | // |
920 | // compile return (return code, index) where index is the index of the error in the query string if return code is an error code |
921 | fn (mut re RE) impl_compile(in_txt string) (int, int) { |
922 | mut i := 0 // input string index |
923 | mut pc := 0 // program counter |
924 | |
925 | // group management variables |
926 | mut group_count := -1 |
927 | mut group_stack := []int{len: re.group_max_nested, init: 0} |
928 | mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1} |
929 | mut group_stack_index := -1 |
930 | |
931 | re.query = in_txt // save the query string |
932 | |
933 | i = 0 |
934 | for i < in_txt.len { |
935 | mut char_tmp := u32(0) |
936 | mut char_len := 0 |
937 | // println("i: ${i:3d} ch: ${in_txt.str[i]:c}") |
938 | |
939 | char_tmp, char_len = re.get_char(in_txt, i) |
940 | |
941 | // |
942 | // check special cases: $ ^ |
943 | // |
944 | if char_len == 1 && i == 0 && u8(char_tmp) == `^` { |
945 | re.flag = regex.f_ms |
946 | i = i + char_len |
947 | continue |
948 | } |
949 | if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` { |
950 | re.flag = regex.f_me |
951 | i = i + char_len |
952 | continue |
953 | } |
954 | |
955 | // ist_group_start |
956 | if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` { |
957 | // check max groups allowed |
958 | if group_count > re.group_max { |
959 | return regex.err_groups_overflow, i + 1 |
960 | } |
961 | group_stack_index++ |
962 | |
963 | // check max nested groups allowed |
964 | if group_stack_index > re.group_max_nested { |
965 | return regex.err_groups_max_nested, i + 1 |
966 | } |
967 | |
968 | tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt, |
969 | i) |
970 | |
971 | // manage question mark format error |
972 | if tmp_res < -1 { |
973 | return regex.err_group_qm_notation, next_i |
974 | } |
975 | |
976 | // println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]") |
977 | i = next_i |
978 | |
979 | if cgroup_flag == true { |
980 | group_count++ |
981 | } |
982 | |
983 | // calculate the group id |
984 | // if it is a named group, recycle the group id |
985 | // NOTE: **** the group index is +1 because map return 0 when not found!! **** |
986 | mut group_id := group_count |
987 | if cgroup_name.len > 0 { |
988 | // println("GROUP NAME: ${cgroup_name}") |
989 | if cgroup_name in re.group_map { |
990 | group_id = re.group_map[cgroup_name] - 1 |
991 | group_count-- |
992 | } else { |
993 | re.group_map[cgroup_name] = group_id + 1 |
994 | } |
995 | } |
996 | |
997 | group_stack_txt_index[group_stack_index] = i |
998 | group_stack[group_stack_index] = pc |
999 | |
1000 | re.prog[pc].ist = u32(0) | regex.ist_group_start |
1001 | re.prog[pc].rep_min = 1 |
1002 | re.prog[pc].rep_max = 1 |
1003 | |
1004 | // manage negation groups |
1005 | if negate_flag == true { |
1006 | re.prog[pc].group_neg = true |
1007 | re.prog[pc].rep_min = 0 // may be not catched, but it is ok |
1008 | } |
1009 | |
1010 | // set the group id |
1011 | if cgroup_flag == false { |
1012 | // println("NO CAPTURE GROUP") |
1013 | re.prog[pc].group_id = -1 |
1014 | } else { |
1015 | re.prog[pc].group_id = group_id |
1016 | } |
1017 | |
1018 | pc = pc + 1 |
1019 | continue |
1020 | } |
1021 | |
1022 | // ist_group_end |
1023 | if char_len == 1 && pc > 0 && u8(char_tmp) == `)` { |
1024 | if group_stack_index < 0 { |
1025 | return regex.err_group_not_balanced, i + 1 |
1026 | } |
1027 | |
1028 | goto_pc := group_stack[group_stack_index] |
1029 | group_stack_index-- |
1030 | |
1031 | re.prog[pc].ist = u32(0) | regex.ist_group_end |
1032 | re.prog[pc].rep_min = 1 |
1033 | re.prog[pc].rep_max = 1 |
1034 | |
1035 | re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need |
1036 | re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data |
1037 | |
1038 | re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc |
1039 | // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data |
1040 | |
1041 | // duplicate the negation group info and settings |
1042 | if re.prog[goto_pc].group_neg == true { |
1043 | re.prog[pc].group_neg = re.prog[goto_pc].group_neg |
1044 | re.prog[pc].rep_min = re.prog[goto_pc].rep_min |
1045 | } |
1046 | |
1047 | pc = pc + 1 |
1048 | i = i + char_len |
1049 | continue |
1050 | } |
1051 | |
1052 | // ist_dot_char match any char except the following token |
1053 | if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` { |
1054 | // consecutive ist_dot_char is a syntax error |
1055 | if pc > 0 && re.prog[pc - 1].ist == regex.ist_dot_char { |
1056 | return regex.err_consecutive_dots, i |
1057 | } |
1058 | |
1059 | re.prog[pc].ist = u32(0) | regex.ist_dot_char |
1060 | re.prog[pc].rep_min = 1 |
1061 | re.prog[pc].rep_max = 1 |
1062 | pc = pc + 1 |
1063 | i = i + char_len |
1064 | continue |
1065 | } |
1066 | |
1067 | // OR branch |
1068 | if char_len == 1 && pc > 0 && u8(char_tmp) == `|` { |
1069 | if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { |
1070 | return regex.err_syntax_error, i |
1071 | } |
1072 | re.prog[pc].ist = u32(0) | regex.ist_or_branch |
1073 | re.prog[pc].source_index = i |
1074 | pc = pc + 1 |
1075 | i = i + char_len |
1076 | continue |
1077 | } |
1078 | |
1079 | // Quantifiers |
1080 | if char_len == 1 && pc > 0 { |
1081 | mut char_next := rune(0) |
1082 | mut char_next_len := 0 |
1083 | if (char_len + i) < in_txt.len { |
1084 | char_next, char_next_len = re.get_char(in_txt, i + char_len) |
1085 | } |
1086 | mut quant_flag := true |
1087 | |
1088 | // negation groups can not have quantifiers |
1089 | if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] { |
1090 | return regex.err_neg_group_quantifier, i |
1091 | } |
1092 | |
1093 | match u8(char_tmp) { |
1094 | `?` { |
1095 | // println("q: ${char_tmp:c}") |
1096 | // check illegal quantifier sequences |
1097 | if char_next_len == 1 && char_next in regex.quntifier_chars { |
1098 | return regex.err_syntax_error, i |
1099 | } |
1100 | re.prog[pc - 1].rep_min = 0 |
1101 | re.prog[pc - 1].rep_max = 1 |
1102 | } |
1103 | `+` { |
1104 | // println("q: ${char_tmp:c}") |
1105 | // check illegal quantifier sequences |
1106 | if char_next_len == 1 && char_next in regex.quntifier_chars { |
1107 | return regex.err_syntax_error, i |
1108 | } |
1109 | re.prog[pc - 1].rep_min = 1 |
1110 | re.prog[pc - 1].rep_max = regex.max_quantifier |
1111 | } |
1112 | `*` { |
1113 | // println("q: ${char_tmp:c}") |
1114 | // check illegal quantifier sequences |
1115 | if char_next_len == 1 && char_next in regex.quntifier_chars { |
1116 | return regex.err_syntax_error, i |
1117 | } |
1118 | re.prog[pc - 1].rep_min = 0 |
1119 | re.prog[pc - 1].rep_max = regex.max_quantifier |
1120 | } |
1121 | `{` { |
1122 | min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1) |
1123 | // it is a quantifier |
1124 | if min >= 0 { |
1125 | // println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy") |
1126 | i = i + tmp |
1127 | re.prog[pc - 1].rep_min = min |
1128 | re.prog[pc - 1].rep_max = max |
1129 | re.prog[pc - 1].greedy = greedy |
1130 | // check illegal quantifier sequences |
1131 | if i <= in_txt.len { |
1132 | char_next, char_next_len = re.get_char(in_txt, i) |
1133 | if char_next_len == 1 && char_next in regex.quntifier_chars { |
1134 | return regex.err_syntax_error, i |
1135 | } |
1136 | } |
1137 | continue |
1138 | } else { |
1139 | return min, i |
1140 | } |
1141 | |
1142 | // TODO: decide if the open bracket can be conform without the close bracket |
1143 | /* |
1144 | // no conform, parse as normal char |
1145 | else { |
1146 | quant_flag = false |
1147 | } |
1148 | */ |
1149 | } |
1150 | else { |
1151 | quant_flag = false |
1152 | } |
1153 | } |
1154 | |
1155 | if quant_flag { |
1156 | i = i + char_len |
1157 | continue |
1158 | } |
1159 | } |
1160 | |
1161 | // IST_CHAR_CLASS_* |
1162 | if char_len == 1 && pc >= 0 { |
1163 | if u8(char_tmp) == `[` { |
1164 | cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1) |
1165 | if cc_index >= 0 { |
1166 | // println("index: $cc_index str:${in_txt[i..i+tmp]}") |
1167 | i = i + tmp |
1168 | re.prog[pc].ist = u32(0) | cc_type |
1169 | re.prog[pc].cc_index = cc_index |
1170 | re.prog[pc].rep_min = 1 |
1171 | re.prog[pc].rep_max = 1 |
1172 | pc = pc + 1 |
1173 | continue |
1174 | } |
1175 | // cc_class vector memory full |
1176 | else if cc_index < 0 { |
1177 | return cc_index, i |
1178 | } |
1179 | } |
1180 | } |
1181 | |
1182 | // ist_bsls_char |
1183 | if char_len == 1 && pc >= 0 { |
1184 | if u8(char_tmp) == `\\` { |
1185 | bsls_index, tmp := re.parse_bsls(in_txt, i) |
1186 | // println("index: $bsls_index str:${in_txt[i..i+tmp]}") |
1187 | if bsls_index >= 0 { |
1188 | i = i + tmp |
1189 | re.prog[pc].ist = u32(0) | regex.ist_bsls_char |
1190 | re.prog[pc].rep_min = 1 |
1191 | re.prog[pc].rep_max = 1 |
1192 | re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator |
1193 | re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch |
1194 | pc = pc + 1 |
1195 | continue |
1196 | } |
1197 | // this is an escape char, skip the bsls and continue as a normal char |
1198 | else if bsls_index == regex.no_match_found { |
1199 | i += char_len |
1200 | char_tmp, char_len = re.get_char(in_txt, i) |
1201 | // continue as simple char |
1202 | } |
1203 | // if not an escape or a bsls char then it is an error (at least for now!) |
1204 | else { |
1205 | return bsls_index, i + tmp |
1206 | } |
1207 | } |
1208 | } |
1209 | |
1210 | // ist_simple_char |
1211 | re.prog[pc].ist = regex.ist_simple_char |
1212 | re.prog[pc].ch = char_tmp |
1213 | re.prog[pc].ch_len = u8(char_len) |
1214 | re.prog[pc].rep_min = 1 |
1215 | re.prog[pc].rep_max = 1 |
1216 | // println("char: ${char_tmp:c}") |
1217 | pc = pc + 1 |
1218 | |
1219 | i += char_len |
1220 | } |
1221 | |
1222 | // add end of the program |
1223 | re.prog[pc].ist = regex.ist_prog_end |
1224 | re.prog_len = pc |
1225 | |
1226 | // check for unbalanced groups |
1227 | if group_stack_index != -1 { |
1228 | return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1 |
1229 | } |
1230 | |
1231 | // check for OR at the end of the program |
1232 | if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { |
1233 | return regex.err_syntax_error, in_txt.len - 1 |
1234 | } |
1235 | |
1236 | // store the number of groups in the query |
1237 | re.group_count = group_count + 1 |
1238 | |
1239 | //****************************************** |
1240 | // Post processing |
1241 | //****************************************** |
1242 | |
1243 | // |
1244 | // manage ist_dot_char |
1245 | // |
1246 | |
1247 | // find the checks for dot chars, if any... |
1248 | mut pc1 := 0 |
1249 | mut dot_char_count := 0 |
1250 | mut last_dot_char_pc := -1 |
1251 | for pc1 < pc { |
1252 | if re.prog[pc1].ist == regex.ist_dot_char { |
1253 | // println("Dot_char pc: $pc1") |
1254 | last_dot_char_pc = pc1 |
1255 | dot_char_count++ |
1256 | mut pc2 := pc1 + 1 |
1257 | for pc2 < pc { |
1258 | // consecutive dot chars is an error |
1259 | if re.prog[pc2].ist == regex.ist_dot_char { |
1260 | return regex.err_syntax_error, 0 |
1261 | } |
1262 | if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end, |
1263 | regex.ist_group_start] { |
1264 | // println("Next dot char check is PC: ${pc2}") |
1265 | re.prog[pc1].dot_check_pc = pc2 |
1266 | break |
1267 | } |
1268 | pc2++ |
1269 | } |
1270 | } |
1271 | pc1++ |
1272 | } |
1273 | |
1274 | // println("last_dot_char_pc: ${last_dot_char_pc}") |
1275 | if last_dot_char_pc >= 0 { |
1276 | pc1 = last_dot_char_pc + 1 |
1277 | mut is_last_dot := true |
1278 | for pc1 < pc { |
1279 | if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] { |
1280 | is_last_dot = false |
1281 | break |
1282 | } |
1283 | pc1++ |
1284 | } |
1285 | if is_last_dot { |
1286 | re.prog[last_dot_char_pc].last_dot_flag = true |
1287 | } |
1288 | } |
1289 | |
1290 | // |
1291 | // manage bsls_char |
1292 | // |
1293 | |
1294 | // find the checks for bsls, if any... |
1295 | pc1 = 0 |
1296 | mut bsls_char_count := 0 |
1297 | mut last_bsls_char_pc := -1 |
1298 | for pc1 < pc { |
1299 | if re.prog[pc1].ist == regex.ist_bsls_char { |
1300 | // println("bsls_char pc: $pc1") |
1301 | last_bsls_char_pc = pc1 |
1302 | bsls_char_count++ |
1303 | mut pc2 := pc1 + 1 |
1304 | for pc2 < pc { |
1305 | if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end, |
1306 | regex.ist_group_start] { |
1307 | // println("Next bsls check is PC: ${pc2}") |
1308 | re.prog[pc1].bsls_check_pc = pc2 |
1309 | break |
1310 | } |
1311 | pc2++ |
1312 | } |
1313 | } |
1314 | pc1++ |
1315 | } |
1316 | |
1317 | // println('last_bsls_char_pc: ${last_bsls_char_pc}') |
1318 | if last_bsls_char_pc >= 0 { |
1319 | pc1 = last_bsls_char_pc + 1 |
1320 | mut is_last_bsls := true |
1321 | for pc1 < pc { |
1322 | if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] { |
1323 | is_last_bsls = false |
1324 | break |
1325 | } |
1326 | pc1++ |
1327 | } |
1328 | if is_last_bsls { |
1329 | re.prog[last_bsls_char_pc].last_dot_flag = true |
1330 | } |
1331 | } |
1332 | |
1333 | // |
1334 | // manage CC |
1335 | // |
1336 | pc1 = 0 |
1337 | mut cc_char_count := 0 |
1338 | mut last_cc_char_pc := -1 |
1339 | for pc1 < pc { |
1340 | if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] { |
1341 | last_cc_char_pc = pc1 |
1342 | cc_char_count++ |
1343 | mut pc2 := pc1 + 1 |
1344 | for pc2 < pc { |
1345 | if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end, |
1346 | regex.ist_group_start] { |
1347 | // println("Next CC check is PC: ${pc2}") |
1348 | re.prog[pc1].cc_check_pc = pc2 |
1349 | break |
1350 | } |
1351 | pc2++ |
1352 | } |
1353 | } |
1354 | pc1++ |
1355 | } |
1356 | |
1357 | // println('last_cc_char_pc: ${last_cc_char_pc}') |
1358 | if last_cc_char_pc >= 0 { |
1359 | pc1 = last_cc_char_pc + 1 |
1360 | mut is_last_cc := true |
1361 | for pc1 < pc { |
1362 | if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] { |
1363 | is_last_cc = false |
1364 | break |
1365 | } |
1366 | pc1++ |
1367 | } |
1368 | if is_last_cc { |
1369 | re.prog[last_cc_char_pc].last_dot_flag = true |
1370 | } |
1371 | } |
1372 | |
1373 | //****************************************** |
1374 | |
1375 | // OR branch |
1376 | // a|b|cd |
1377 | // d exit point |
1378 | // a,b,c branches |
1379 | // set the jump in the right places |
1380 | pc1 = 0 |
1381 | for pc1 < pc - 2 { |
1382 | // println("Here $pc1 ${pc-2}") |
1383 | // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}") |
1384 | if re.prog[pc1 + 1].ist == regex.ist_or_branch { |
1385 | // two consecutive OR are a syntax error |
1386 | if re.prog[pc1 + 2].ist == regex.ist_or_branch { |
1387 | return regex.err_syntax_error, i |
1388 | } |
1389 | |
1390 | // check for []|[] errors |
1391 | if re.prog[pc1].ist == regex.ist_char_class_pos |
1392 | && re.prog[pc1 + 2].ist == regex.ist_char_class_pos { |
1393 | return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index |
1394 | } |
1395 | } |
1396 | |
1397 | // manange a|b chains like a|(b)|c|d... |
1398 | // standard solution |
1399 | if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch |
1400 | && re.prog[pc1 + 2].ist != regex.ist_or_branch { |
1401 | re.prog[pc1].next_is_or = true // set that the next token is an OR |
1402 | re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump |
1403 | |
1404 | // match jump, if an OR chain the next token will be an OR token |
1405 | mut pc2 := pc1 + 2 |
1406 | for pc2 < pc - 1 { |
1407 | ist := re.prog[pc2].ist |
1408 | if ist == regex.ist_group_start { |
1409 | re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1 |
1410 | break |
1411 | } |
1412 | if ist != regex.ist_or_branch { |
1413 | re.prog[pc1 + 1].rep_max = pc2 + 1 |
1414 | break |
1415 | } |
1416 | |
1417 | pc2++ |
1418 | } |
1419 | // special case query of few chars, the true can't go on the first instruction |
1420 | if re.prog[pc1 + 1].rep_max == pc1 { |
1421 | re.prog[pc1 + 1].rep_max = 3 |
1422 | } |
1423 | // println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]") |
1424 | pc1 = pc2 |
1425 | continue |
1426 | } |
1427 | |
1428 | pc1++ |
1429 | } |
1430 | |
1431 | //****************************************** |
1432 | // DEBUG PRINT REGEX GENERATED CODE |
1433 | //****************************************** |
1434 | if re.debug > 0 { |
1435 | gc := re.get_code() |
1436 | re.log_func(gc) |
1437 | } |
1438 | //****************************************** |
1439 | |
1440 | return regex.compile_ok, 0 |
1441 | } |
1442 | |
1443 | // get_code return the compiled code as regex string, note: may be different from the source! |
1444 | pub fn (re RE) get_code() string { |
1445 | mut pc1 := 0 |
1446 | mut res := strings.new_builder(re.cc.len * 2 * re.prog.len) |
1447 | res.write_string('========================================\nv RegEx compiler v ${regex.v_regex_version} output:\n') |
1448 | |
1449 | mut stop_flag := false |
1450 | |
1451 | for pc1 <= re.prog.len { |
1452 | tk := re.prog[pc1] |
1453 | res.write_string('PC:${pc1:3d}') |
1454 | |
1455 | res.write_string(' ist: ') |
1456 | res.write_string('${tk.ist:8x}'.replace(' ', '0')) |
1457 | res.write_string(' ') |
1458 | ist := tk.ist |
1459 | if ist == regex.ist_bsls_char { |
1460 | res.write_string('[\\${tk.ch:1c}] BSLS') |
1461 | if tk.last_dot_flag == true { |
1462 | res.write_string(' last!') |
1463 | } |
1464 | } else if ist == regex.ist_prog_end { |
1465 | res.write_string('PROG_END') |
1466 | stop_flag = true |
1467 | } else if ist == regex.ist_or_branch { |
1468 | res.write_string('OR ') |
1469 | } else if ist == regex.ist_char_class_pos { |
1470 | res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS') |
1471 | if tk.last_dot_flag == true { |
1472 | res.write_string(' last!') |
1473 | } |
1474 | } else if ist == regex.ist_char_class_neg { |
1475 | res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG') |
1476 | if tk.last_dot_flag == true { |
1477 | res.write_string(' last!') |
1478 | } |
1479 | } else if ist == regex.ist_dot_char { |
1480 | res.write_string('. DOT_CHAR nx chk: ${tk.dot_check_pc}') |
1481 | if tk.last_dot_flag == true { |
1482 | res.write_string(' last!') |
1483 | } |
1484 | } else if ist == regex.ist_group_start { |
1485 | res.write_string('( GROUP_START #:${tk.group_id}') |
1486 | if tk.group_id == -1 { |
1487 | res.write_string(' ?:') |
1488 | } else { |
1489 | for x in re.group_map.keys() { |
1490 | if re.group_map[x] == (tk.group_id + 1) { |
1491 | res.write_string(' ?P<${x}>') |
1492 | break |
1493 | } |
1494 | } |
1495 | } |
1496 | } else if ist == regex.ist_group_end { |
1497 | res.write_string(') GROUP_END #:${tk.group_id}') |
1498 | } else if ist == regex.ist_simple_char { |
1499 | res.write_string('[${tk.ch:1c}] query_ch') |
1500 | } |
1501 | |
1502 | if tk.rep_max == regex.max_quantifier { |
1503 | res.write_string(' {${tk.rep_min:3d},MAX}') |
1504 | } else { |
1505 | if ist == regex.ist_or_branch { |
1506 | res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}') |
1507 | } else { |
1508 | res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}') |
1509 | } |
1510 | if tk.greedy == true { |
1511 | res.write_string('?') |
1512 | } |
1513 | } |
1514 | |
1515 | res.write_string('\n') |
1516 | if stop_flag { |
1517 | break |
1518 | } |
1519 | pc1++ |
1520 | } |
1521 | |
1522 | res.write_string('========================================\n') |
1523 | return res.str() |
1524 | } |
1525 | |
1526 | // get_query return a string with a reconstruction of the query starting from the regex program code |
1527 | pub fn (re RE) get_query() string { |
1528 | mut res := strings.new_builder(re.query.len * 2) |
1529 | |
1530 | if (re.flag & regex.f_ms) != 0 { |
1531 | res.write_string('^') |
1532 | } |
1533 | |
1534 | mut i := 0 |
1535 | for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 { |
1536 | tk := unsafe { &re.prog[i] } |
1537 | ch := tk.ist |
1538 | |
1539 | // GROUP start |
1540 | if ch == regex.ist_group_start { |
1541 | if re.debug > 0 { |
1542 | res.write_string('#${tk.group_id}') |
1543 | } |
1544 | res.write_string('(') |
1545 | |
1546 | if tk.group_neg == true { |
1547 | res.write_string('?!') // negation group |
1548 | } else if tk.group_id == -1 { |
1549 | res.write_string('?:') // non capturing group |
1550 | } |
1551 | |
1552 | for x in re.group_map.keys() { |
1553 | if re.group_map[x] == (tk.group_id + 1) { |
1554 | res.write_string('?P<${x}>') |
1555 | break |
1556 | } |
1557 | } |
1558 | |
1559 | i++ |
1560 | continue |
1561 | } |
1562 | |
1563 | // GROUP end |
1564 | if ch == regex.ist_group_end { |
1565 | res.write_string(')') |
1566 | } |
1567 | |
1568 | // OR branch |
1569 | if ch == regex.ist_or_branch { |
1570 | res.write_string('|') |
1571 | if re.debug > 0 { |
1572 | res.write_string('{${tk.rep_min},${tk.rep_max}}') |
1573 | } |
1574 | i++ |
1575 | continue |
1576 | } |
1577 | |
1578 | // char class |
1579 | if ch == regex.ist_char_class_neg || ch == regex.ist_char_class_pos { |
1580 | res.write_string('[') |
1581 | if ch == regex.ist_char_class_neg { |
1582 | res.write_string('^') |
1583 | } |
1584 | res.write_string('${re.get_char_class(i)}') |
1585 | res.write_string(']') |
1586 | } |
1587 | |
1588 | // bsls char |
1589 | if ch == regex.ist_bsls_char { |
1590 | res.write_string('\\${tk.ch:1c}') |
1591 | } |
1592 | |
1593 | // ist_dot_char |
1594 | if ch == regex.ist_dot_char { |
1595 | res.write_string('.') |
1596 | } |
1597 | |
1598 | // char alone |
1599 | if ch == regex.ist_simple_char { |
1600 | if u8(ch) in regex.bsls_escape_list { |
1601 | res.write_string('\\') |
1602 | } |
1603 | res.write_string('${tk.ch:c}') |
1604 | } |
1605 | |
1606 | // quantifier |
1607 | if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false { |
1608 | if tk.rep_min == 0 && tk.rep_max == 1 { |
1609 | res.write_string('?') |
1610 | } else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier { |
1611 | res.write_string('+') |
1612 | } else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier { |
1613 | res.write_string('*') |
1614 | } else { |
1615 | if tk.rep_max == regex.max_quantifier { |
1616 | res.write_string('{${tk.rep_min},MAX}') |
1617 | } else { |
1618 | res.write_string('{${tk.rep_min},${tk.rep_max}}') |
1619 | } |
1620 | if tk.greedy == true { |
1621 | res.write_string('?') |
1622 | } |
1623 | } |
1624 | } |
1625 | i++ |
1626 | } |
1627 | if (re.flag & regex.f_me) != 0 { |
1628 | res.write_string('$') |
1629 | } |
1630 | |
1631 | return res.str() |
1632 | } |
1633 | |
1634 | /****************************************************************************** |
1635 | * |
1636 | * Groups saving utilities |
1637 | * |
1638 | ******************************************************************************/ |
1639 | [direct_array_access] |
1640 | fn (mut re RE) group_continuous_save(g_index int) { |
1641 | if re.group_csave_flag == true { |
1642 | // continuous save, save until we have space |
1643 | |
1644 | // init the first element as counter |
1645 | if re.group_csave.len == 0 { |
1646 | re.group_csave << 0 |
1647 | } |
1648 | |
1649 | gi := g_index >> 1 |
1650 | start := re.groups[g_index] |
1651 | end := re.groups[g_index + 1] |
1652 | |
1653 | // check if we are simply increasing the size ot the found group |
1654 | if re.group_csave.len >= 4 && gi == re.group_csave[re.group_csave.len - 3] |
1655 | && start == re.group_csave[re.group_csave.len - 2] { |
1656 | re.group_csave[re.group_csave.len - 1] = end |
1657 | return |
1658 | } |
1659 | |
1660 | // otherwise append a new group to the list |
1661 | |
1662 | // increment counter |
1663 | re.group_csave[0]++ |
1664 | // save the record |
1665 | re.group_csave << (g_index >> 1) // group id |
1666 | re.group_csave << re.groups[g_index] // start |
1667 | re.group_csave << re.groups[g_index + 1] // end |
1668 | } |
1669 | } |
1670 | |
1671 | /****************************************************************************** |
1672 | * |
1673 | * Matching |
1674 | * |
1675 | ******************************************************************************/ |
1676 | enum Match_state { |
1677 | start = 0 |
1678 | stop |
1679 | end |
1680 | new_line |
1681 | ist_load // load and execute instruction |
1682 | ist_next // go to next instruction |
1683 | ist_next_ks // go to next instruction without clenaning the state |
1684 | ist_quant_p // match positive ,quantifier check |
1685 | ist_quant_n // match negative, quantifier check |
1686 | ist_quant_pg // match positive ,group quantifier check |
1687 | ist_quant_ng // match negative ,group quantifier check |
1688 | } |
1689 | |
1690 | fn state_str(s Match_state) string { |
1691 | match s { |
1692 | .start { return 'start' } |
1693 | .stop { return 'stop' } |
1694 | .end { return 'end' } |
1695 | .new_line { return 'new line' } |
1696 | .ist_load { return 'ist_load' } |
1697 | .ist_next { return 'ist_next' } |
1698 | .ist_next_ks { return 'ist_next_ks' } |
1699 | .ist_quant_p { return 'ist_quant_p' } |
1700 | .ist_quant_n { return 'ist_quant_n' } |
1701 | .ist_quant_pg { return 'ist_quant_pg' } |
1702 | .ist_quant_ng { return 'ist_quant_ng' } |
1703 | } |
1704 | } |
1705 | |
1706 | struct StateObj { |
1707 | pub mut: |
1708 | group_index int = -1 // group id used to know how many groups are open |
1709 | match_flag bool // indicate if we are in a match condition |
1710 | match_index int = -1 // index of the last match |
1711 | first_match int = -1 // index of the first match |
1712 | pc int = -1 // program counter |
1713 | i int = -1 // source string index |
1714 | char_len int // last char legth |
1715 | last_dot_pc int = -1 // last dot chat pc |
1716 | } |
1717 | |
1718 | [direct_array_access] |
1719 | pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) { |
1720 | // result status |
1721 | mut result := regex.no_match_found // function return |
1722 | |
1723 | mut ch := rune(0) // examinated char |
1724 | mut char_len := 0 // utf8 examinated char len |
1725 | mut m_state := Match_state.start // start point for the matcher FSM |
1726 | mut src_end := false |
1727 | mut last_fnd_pc := -1 |
1728 | |
1729 | mut state := StateObj{} // actual state |
1730 | mut ist := rune(0) // actual instruction |
1731 | mut l_ist := rune(0) // last matched instruction |
1732 | |
1733 | mut step_count := 0 // stats for debug |
1734 | mut dbg_line := 0 // count debug line printed |
1735 | |
1736 | re.reset() |
1737 | |
1738 | if re.debug > 0 { |
1739 | // print header |
1740 | mut h_buf := strings.new_builder(32) |
1741 | h_buf.write_string('flags: ') |
1742 | h_buf.write_string('${re.flag:8x}'.replace(' ', '0')) |
1743 | h_buf.write_string('\n') |
1744 | sss := h_buf.str() |
1745 | re.log_func(sss) |
1746 | } |
1747 | |
1748 | for m_state != .end { |
1749 | if state.pc >= 0 && state.pc < re.prog.len { |
1750 | ist = re.prog[state.pc].ist |
1751 | } else if state.pc >= re.prog.len { |
1752 | // println("ERROR!! PC overflow!!") |
1753 | return regex.err_internal_error, state.i |
1754 | } |
1755 | |
1756 | //****************************************** |
1757 | // DEBUG LOG |
1758 | //****************************************** |
1759 | if re.debug > 0 { |
1760 | mut buf2 := strings.new_builder(re.cc.len + 128) |
1761 | |
1762 | // print all the instructions |
1763 | |
1764 | // end of the input text |
1765 | if state.i >= in_txt_len { |
1766 | buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n') |
1767 | sss := buf2.str() |
1768 | re.log_func(sss) |
1769 | } else { |
1770 | // print only the exe instruction |
1771 | if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 { |
1772 | if ist == regex.ist_prog_end { |
1773 | buf2.write_string('# ${step_count:3d} PROG_END\n') |
1774 | } else if ist == 0 || m_state in [.start, .ist_next, .stop] { |
1775 | buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n') |
1776 | } else { |
1777 | ch, char_len = re.get_charb(in_txt, state.i) |
1778 | |
1779 | buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>') |
1780 | buf2.write_string('${ist:8x}'.replace(' ', '0')) |
1781 | buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ") |
1782 | |
1783 | if ist == regex.ist_simple_char { |
1784 | buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]') |
1785 | } else { |
1786 | if ist == regex.ist_bsls_char { |
1787 | buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]') |
1788 | } else if ist == regex.ist_prog_end { |
1789 | buf2.write_string('PROG_END') |
1790 | } else if ist == regex.ist_or_branch { |
1791 | buf2.write_string('OR') |
1792 | } else if ist == regex.ist_char_class_pos { |
1793 | buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]') |
1794 | } else if ist == regex.ist_char_class_neg { |
1795 | buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]') |
1796 | } else if ist == regex.ist_dot_char { |
1797 | buf2.write_string('DOT_CHAR') |
1798 | } else if ist == regex.ist_group_start { |
1799 | tmp_gi := re.prog[state.pc].group_id |
1800 | tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep |
1801 | buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ') |
1802 | } else if ist == regex.ist_group_end { |
1803 | buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:${state.group_index}') |
1804 | } |
1805 | } |
1806 | if re.prog[state.pc].rep_max == regex.max_quantifier { |
1807 | buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}') |
1808 | } else { |
1809 | buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}') |
1810 | } |
1811 | if re.prog[state.pc].greedy == true { |
1812 | buf2.write_string('?') |
1813 | } |
1814 | buf2.write_string(' (#${state.group_index})') |
1815 | |
1816 | if ist == regex.ist_dot_char { |
1817 | buf2.write_string(' last!') |
1818 | } |
1819 | |
1820 | buf2.write_string('\n') |
1821 | } |
1822 | sss2 := buf2.str() |
1823 | re.log_func(sss2) |
1824 | } |
1825 | } |
1826 | step_count++ |
1827 | dbg_line++ |
1828 | } |
1829 | //****************************************** |
1830 | |
1831 | if ist == regex.ist_prog_end { |
1832 | // println("HERE we end!") |
1833 | break |
1834 | } |
1835 | |
1836 | // we're out of text, manage it |
1837 | if state.i >= in_txt_len || m_state == .new_line { |
1838 | // println("Finished text!!") |
1839 | src_end = true |
1840 | |
1841 | // we have fished the text, we must manage out pf bound indexes |
1842 | if state.i >= in_txt_len { |
1843 | state.i = in_txt_len - 1 |
1844 | } |
1845 | |
1846 | // manage groups |
1847 | if state.group_index >= 0 && state.match_index >= 0 { |
1848 | // println("End text with open groups!") |
1849 | // println("state.group_index: ${state.group_index}") |
1850 | // close the groups |
1851 | for state.group_index >= 0 { |
1852 | tmp_pc := re.group_data[state.group_index] |
1853 | re.prog[tmp_pc].group_rep++ |
1854 | // println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}") |
1855 | |
1856 | if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min |
1857 | && re.prog[tmp_pc].group_id >= 0 { |
1858 | start_i := re.group_stack[state.group_index] |
1859 | re.group_stack[state.group_index] = -1 |
1860 | |
1861 | // save group results |
1862 | g_index := re.prog[tmp_pc].group_id * 2 |
1863 | // println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}") |
1864 | if start_i >= 0 { |
1865 | re.groups[g_index] = start_i |
1866 | } else { |
1867 | re.groups[g_index] = 0 |
1868 | } |
1869 | |
1870 | re.groups[g_index + 1] = state.i |
1871 | |
1872 | if re.groups[g_index + 1] >= in_txt_len { |
1873 | // println("clamp group on stop!") |
1874 | re.groups[g_index + 1] = in_txt_len - 1 |
1875 | } |
1876 | |
1877 | // continuous save, save until we have space |
1878 | re.group_continuous_save(g_index) |
1879 | } |
1880 | state.group_index-- |
1881 | } |
1882 | } |
1883 | |
1884 | // println("re.groups: ${re.groups}") |
1885 | |
1886 | // the text is finished and the groups closed and we are the last group, ok exit |
1887 | if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end { |
1888 | // println("Last group end") |
1889 | return state.first_match, state.i |
1890 | } |
1891 | |
1892 | if state.pc == -1 { |
1893 | state.pc = last_fnd_pc |
1894 | } |
1895 | |
1896 | // println("Finished text!!") |
1897 | // println("Instruction: ${ist:08x} pc: $state.pc") |
1898 | // println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}") |
1899 | |
1900 | // program end |
1901 | if ist == regex.ist_prog_end { |
1902 | // println("Program end on end of text!") |
1903 | return state.first_match, state.i |
1904 | } |
1905 | |
1906 | if l_ist in [ |
1907 | rune(regex.ist_char_class_neg), |
1908 | regex.ist_char_class_pos, |
1909 | regex.ist_bsls_char, |
1910 | regex.ist_dot_char, |
1911 | ] { |
1912 | // println("***** We have a last special token") |
1913 | // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}") |
1914 | // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}") |
1915 | // println("first match: ${state.first_match}") |
1916 | |
1917 | if re.prog[state.pc].last_dot_flag == true |
1918 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min |
1919 | && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { |
1920 | return state.first_match, state.i |
1921 | } |
1922 | // println("Not fitted!!") |
1923 | } |
1924 | // no groups open, check the last token quantifier |
1925 | if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end { |
1926 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_min |
1927 | && re.prog[state.pc].rep <= re.prog[state.pc].rep_max { |
1928 | // println("We are in good repetition") |
1929 | return state.first_match, state.i |
1930 | } |
1931 | } |
1932 | |
1933 | // println("No good exit!!") |
1934 | if re.prog[re.prog_len - 1].ist == regex.ist_group_end { |
1935 | // println("last ist is a group end!") |
1936 | if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min { |
1937 | return state.first_match, state.i |
1938 | } |
1939 | } |
1940 | return regex.no_match_found, state.i |
1941 | } |
1942 | |
1943 | // starting and init |
1944 | if m_state == .start { |
1945 | state.pc = -1 |
1946 | state.i = 0 |
1947 | m_state = .ist_next |
1948 | continue |
1949 | } |
1950 | // ist_next, next instruction reseting its state |
1951 | else if m_state == .ist_next { |
1952 | state.pc = state.pc + 1 |
1953 | re.prog[state.pc].reset() |
1954 | // check if we are in the program bounds |
1955 | if state.pc < 0 || state.pc > re.prog.len { |
1956 | // println("ERROR!! PC overflow!!") |
1957 | return regex.err_internal_error, state.i |
1958 | } |
1959 | m_state = .ist_load |
1960 | continue |
1961 | } |
1962 | // ist_next_ks, next instruction keeping its state |
1963 | else if m_state == .ist_next_ks { |
1964 | state.pc = state.pc + 1 |
1965 | // check if we are in the program bounds |
1966 | if state.pc < 0 || state.pc > re.prog.len { |
1967 | // println("ERROR!! PC overflow!!") |
1968 | return regex.err_internal_error, state.i |
1969 | } |
1970 | m_state = .ist_load |
1971 | continue |
1972 | } |
1973 | |
1974 | // load the char |
1975 | ch, char_len = re.get_charb(in_txt, state.i) |
1976 | |
1977 | // check new line if flag f_nl enabled |
1978 | if (re.flag & regex.f_nl) != 0 && char_len == 1 && u8(ch) in regex.new_line_list { |
1979 | m_state = .new_line |
1980 | continue |
1981 | } |
1982 | // check if stop |
1983 | else if m_state == .stop { |
1984 | // we are in search mode, don't exit until the end |
1985 | if (re.flag & regex.f_src) != 0 && ist != regex.ist_prog_end { |
1986 | last_fnd_pc = state.pc |
1987 | state.pc = -1 |
1988 | state.i += char_len |
1989 | |
1990 | m_state = .ist_next |
1991 | re.reset_src() |
1992 | state.match_index = -1 |
1993 | state.first_match = -1 |
1994 | |
1995 | // reset state list |
1996 | re.reset() |
1997 | |
1998 | continue |
1999 | } |
2000 | |
2001 | if ist == regex.ist_prog_end { |
2002 | return state.first_match, state.i |
2003 | } |
2004 | |
2005 | // manage here dot char |
2006 | |
2007 | if re.state_list.len > 0 { |
2008 | // println("Here we are, with stop: state buffer: [${re.state_list.len}]") |
2009 | state = re.state_list.pop() |
2010 | |
2011 | state.match_flag = true |
2012 | l_ist = u32(regex.ist_dot_char) |
2013 | |
2014 | if state.first_match < 0 { |
2015 | state.first_match = state.i |
2016 | } |
2017 | state.match_index = state.i |
2018 | re.prog[state.pc].rep++ // increase repetitions |
2019 | |
2020 | state.i += char_len |
2021 | m_state = .ist_quant_p |
2022 | continue |
2023 | } |
2024 | |
2025 | // exit on no match |
2026 | return result, state.i |
2027 | } |
2028 | // ist_load |
2029 | else if m_state == .ist_load { |
2030 | // program end |
2031 | if ist == regex.ist_prog_end { |
2032 | // if we are in match exit well |
2033 | |
2034 | if state.group_index >= 0 && state.match_index >= 0 { |
2035 | state.group_index = -1 |
2036 | } |
2037 | |
2038 | m_state = .stop |
2039 | continue |
2040 | } |
2041 | // check GROUP start, no quantifier is checkd for this token!! |
2042 | else if ist == regex.ist_group_start { |
2043 | state.group_index++ |
2044 | re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape |
2045 | re.group_stack[state.group_index] = state.i // index where we start to manage |
2046 | // println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}") |
2047 | |
2048 | m_state = .ist_next |
2049 | continue |
2050 | } |
2051 | // check GROUP end |
2052 | else if ist == regex.ist_group_end { |
2053 | // we are in matching streak |
2054 | // println("Group END!! last ist: ${l_ist:08x}") |
2055 | if state.match_index >= 0 { |
2056 | // restore txt index stack and save the group data |
2057 | |
2058 | // println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}") |
2059 | if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 { |
2060 | start_i := re.group_stack[state.group_index] |
2061 | |
2062 | // save group results |
2063 | g_index := re.prog[state.pc].group_id * 2 |
2064 | |
2065 | if start_i >= 0 { |
2066 | re.groups[g_index] = start_i |
2067 | } else { |
2068 | re.groups[g_index] = 0 |
2069 | } |
2070 | |
2071 | re.groups[g_index + 1] = state.i |
2072 | |
2073 | if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] { |
2074 | re.groups[g_index] = re.groups[g_index - 1] |
2075 | } |
2076 | |
2077 | if re.groups[g_index + 1] >= in_txt_len { |
2078 | // println("clamp group!") |
2079 | re.groups[g_index + 1] = in_txt_len - 1 |
2080 | } |
2081 | |
2082 | // println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len") |
2083 | |
2084 | // continuous save, save until we have space |
2085 | re.group_continuous_save(g_index) |
2086 | } |
2087 | |
2088 | re.prog[state.pc].group_rep++ // increase repetitions |
2089 | // println("GROUP $group_index END ${re.prog[state.pc].group_rep}") |
2090 | if re.prog[state.pc].group_rep > in_txt_len - 1 { |
2091 | m_state = .ist_quant_ng |
2092 | continue |
2093 | } |
2094 | |
2095 | m_state = .ist_quant_pg |
2096 | continue |
2097 | } |
2098 | |
2099 | m_state = .ist_quant_ng |
2100 | continue |
2101 | } |
2102 | // check OR |
2103 | else if ist == regex.ist_or_branch { |
2104 | if state.match_index >= 0 { |
2105 | state.pc = re.prog[state.pc].rep_max |
2106 | // println("ist_or_branch True pc: $state.pc") |
2107 | } else { |
2108 | state.pc = re.prog[state.pc].rep_min |
2109 | // println("ist_or_branch False pc: $state.pc") |
2110 | } |
2111 | re.prog[state.pc].reset() |
2112 | m_state = .ist_load |
2113 | continue |
2114 | } |
2115 | // check ist_dot_char |
2116 | else if ist == regex.ist_dot_char { |
2117 | // println("ist_dot_char rep: ${re.prog[state.pc].rep}") |
2118 | |
2119 | // check next token to be false |
2120 | mut next_check_flag := false |
2121 | |
2122 | // if we are done with max go on dot char are dedicated case!! |
2123 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { |
2124 | re.state_list.pop() |
2125 | m_state = .ist_next |
2126 | continue |
2127 | } |
2128 | |
2129 | if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0 |
2130 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { |
2131 | // load the char |
2132 | // ch_t, _ := re.get_charb(in_txt, state.i+char_len) |
2133 | ch_t := ch |
2134 | chk_pc := re.prog[state.pc].dot_check_pc |
2135 | |
2136 | // simple char |
2137 | if re.prog[chk_pc].ist == regex.ist_simple_char { |
2138 | if re.prog[chk_pc].ch == ch_t { |
2139 | next_check_flag = true |
2140 | } |
2141 | // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") |
2142 | } |
2143 | // char char_class |
2144 | else if re.prog[chk_pc].ist == regex.ist_char_class_pos |
2145 | || re.prog[chk_pc].ist == regex.ist_char_class_neg { |
2146 | mut cc_neg := false |
2147 | if re.prog[chk_pc].ist == regex.ist_char_class_neg { |
2148 | cc_neg = true |
2149 | } |
2150 | mut cc_res := re.check_char_class(chk_pc, ch_t) |
2151 | |
2152 | if cc_neg { |
2153 | cc_res = !cc_res |
2154 | } |
2155 | next_check_flag = cc_res |
2156 | // println("Check [ist_char_class] => $next_check_flag") |
2157 | } |
2158 | // check bsls |
2159 | else if re.prog[chk_pc].ist == regex.ist_bsls_char { |
2160 | next_check_flag = re.prog[chk_pc].validator(u8(ch_t)) |
2161 | // println("Check [ist_bsls_char] => $next_check_flag") |
2162 | } |
2163 | } |
2164 | |
2165 | // check if we must continue or pass to the next IST |
2166 | if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end { |
2167 | // println("save the state!!") |
2168 | mut dot_state := StateObj{ |
2169 | group_index: state.group_index |
2170 | match_flag: state.match_flag |
2171 | match_index: state.match_index |
2172 | first_match: state.first_match |
2173 | pc: state.pc |
2174 | i: state.i + char_len |
2175 | char_len: char_len |
2176 | last_dot_pc: state.pc |
2177 | } |
2178 | // if we are mananging a .* stay on the same char on return |
2179 | if re.prog[state.pc].rep_min == 0 { |
2180 | dot_state.i -= char_len |
2181 | } |
2182 | |
2183 | re.state_list << dot_state |
2184 | |
2185 | m_state = .ist_quant_n |
2186 | // println("dot_char stack len: ${re.state_list.len}") |
2187 | continue |
2188 | } |
2189 | |
2190 | state.match_flag = true |
2191 | l_ist = u32(regex.ist_dot_char) |
2192 | |
2193 | if state.first_match < 0 { |
2194 | state.first_match = state.i |
2195 | } |
2196 | state.match_index = state.i |
2197 | re.prog[state.pc].rep++ // increase repetitions |
2198 | |
2199 | state.i += char_len |
2200 | m_state = .ist_quant_p |
2201 | continue |
2202 | } |
2203 | // char class IST |
2204 | else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg { |
2205 | // check next token to be false |
2206 | mut next_check_flag := false |
2207 | |
2208 | // if we are done with max go on dot char are dedicated case!! |
2209 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { |
2210 | re.state_list.pop() |
2211 | m_state = .ist_next |
2212 | continue |
2213 | } |
2214 | |
2215 | if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].cc_check_pc >= 0 |
2216 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { |
2217 | // load the char |
2218 | // ch_t, _ := re.get_charb(in_txt, state.i+char_len) |
2219 | ch_t := ch |
2220 | chk_pc := re.prog[state.pc].cc_check_pc |
2221 | |
2222 | // simple char |
2223 | if re.prog[chk_pc].ist == regex.ist_simple_char { |
2224 | if re.prog[chk_pc].ch == ch_t { |
2225 | next_check_flag = true |
2226 | } |
2227 | // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") |
2228 | } |
2229 | // char char_class |
2230 | else if re.prog[chk_pc].ist == regex.ist_char_class_pos |
2231 | || re.prog[chk_pc].ist == regex.ist_char_class_neg { |
2232 | mut cc_neg := false |
2233 | if re.prog[chk_pc].ist == regex.ist_char_class_neg { |
2234 | cc_neg = true |
2235 | } |
2236 | mut cc_res := re.check_char_class(chk_pc, ch_t) |
2237 | |
2238 | if cc_neg { |
2239 | cc_res = !cc_res |
2240 | } |
2241 | next_check_flag = cc_res |
2242 | // println("Check [ist_char_class] => $next_check_flag") |
2243 | } |
2244 | // check bsls |
2245 | else if re.prog[chk_pc].ist == regex.ist_bsls_char { |
2246 | next_check_flag = re.prog[chk_pc].validator(u8(ch_t)) |
2247 | // println("Check [ist_bsls_char] => $next_check_flag") |
2248 | } |
2249 | } |
2250 | |
2251 | // check if we must continue or pass to the next IST |
2252 | if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end { |
2253 | // println("save the state!!") |
2254 | mut dot_state := StateObj{ |
2255 | group_index: state.group_index |
2256 | match_flag: state.match_flag |
2257 | match_index: state.match_index |
2258 | first_match: state.first_match |
2259 | pc: state.pc |
2260 | i: state.i + char_len |
2261 | char_len: char_len |
2262 | last_dot_pc: state.pc |
2263 | } |
2264 | // if we are managing a \[something]* stay on the same char on return |
2265 | if re.prog[state.pc].rep_min == 0 { |
2266 | dot_state.i -= char_len |
2267 | } |
2268 | |
2269 | re.state_list << dot_state |
2270 | |
2271 | m_state = .ist_quant_n |
2272 | // println("dot_char stack len: ${re.state_list.len}") |
2273 | continue |
2274 | } |
2275 | |
2276 | state.match_flag = false |
2277 | mut cc_neg := false |
2278 | |
2279 | if ist == regex.ist_char_class_neg { |
2280 | cc_neg = true |
2281 | } |
2282 | mut cc_res := re.check_char_class(state.pc, ch) |
2283 | |
2284 | if cc_neg { |
2285 | cc_res = !cc_res |
2286 | } |
2287 | |
2288 | if cc_res { |
2289 | state.match_flag = true |
2290 | l_ist = u32(regex.ist_char_class_pos) |
2291 | |
2292 | if state.first_match < 0 { |
2293 | state.first_match = state.i |
2294 | } |
2295 | |
2296 | state.match_index = state.i |
2297 | |
2298 | re.prog[state.pc].rep++ // increase repetitions |
2299 | state.i += char_len // next char |
2300 | m_state = .ist_quant_p |
2301 | continue |
2302 | } |
2303 | m_state = .ist_quant_n |
2304 | continue |
2305 | } |
2306 | // check bsls |
2307 | else if ist == regex.ist_bsls_char { |
2308 | // println("ist_bsls_char rep: ${re.prog[state.pc].rep}") |
2309 | |
2310 | // check next token to be false |
2311 | mut next_check_flag := false |
2312 | |
2313 | // if we are done with max go on dot char are dedicated case!! |
2314 | if re.prog[state.pc].rep >= re.prog[state.pc].rep_max { |
2315 | re.state_list.pop() |
2316 | m_state = .ist_next |
2317 | continue |
2318 | } |
2319 | |
2320 | if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0 |
2321 | && re.prog[state.pc].rep >= re.prog[state.pc].rep_min { |
2322 | // load the char |
2323 | // ch_t, _ := re.get_charb(in_txt, state.i+char_len) |
2324 | ch_t := ch |
2325 | chk_pc := re.prog[state.pc].bsls_check_pc |
2326 | |
2327 | // simple char |
2328 | if re.prog[chk_pc].ist == regex.ist_simple_char { |
2329 | if re.prog[chk_pc].ch == ch_t { |
2330 | next_check_flag = true |
2331 | } |
2332 | // println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag") |
2333 | } |
2334 | // char char_class |
2335 | else if re.prog[chk_pc].ist == regex.ist_char_class_pos |
2336 | || re.prog[chk_pc].ist == regex.ist_char_class_neg { |
2337 | mut cc_neg := false |
2338 | if re.prog[chk_pc].ist == regex.ist_char_class_neg { |
2339 | cc_neg = true |
2340 | } |
2341 | mut cc_res := re.check_char_class(chk_pc, ch_t) |
2342 | |
2343 | if cc_neg { |
2344 | cc_res = !cc_res |
2345 | } |
2346 | next_check_flag = cc_res |
2347 | // println("Check [ist_char_class] => $next_check_flag") |
2348 | } |
2349 | // check bsls |
2350 | else if re.prog[chk_pc].ist == regex.ist_bsls_char { |
2351 | next_check_flag = re.prog[chk_pc].validator(u8(ch_t)) |
2352 | // println("Check [ist_bsls_char] => $next_check_flag") |
2353 | } |
2354 | } |
2355 | |
2356 | // check if we must continue or pass to the next IST |
2357 | if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end { |
2358 | // println("save the state!!") |
2359 | mut dot_state := StateObj{ |
2360 | group_index: state.group_index |
2361 | match_flag: state.match_flag |
2362 | match_index: state.match_index |
2363 | first_match: state.first_match |
2364 | pc: state.pc |
2365 | i: state.i + char_len |
2366 | char_len: char_len |
2367 | last_dot_pc: state.pc |
2368 | } |
2369 | // if we are managing a \[something]* stay on the same char on return |
2370 | if re.prog[state.pc].rep_min == 0 { |
2371 | dot_state.i -= char_len |
2372 | } |
2373 | |
2374 | re.state_list << dot_state |
2375 | |
2376 | m_state = .ist_quant_n |
2377 | // println("dot_char stack len: ${re.state_list.len}") |
2378 | continue |
2379 | } |
2380 | |
2381 | tmp_res := re.prog[state.pc].validator(u8(ch)) |
2382 | if tmp_res == false { |
2383 | m_state = .ist_quant_n |
2384 | continue |
2385 | } |
2386 | // println("${ch} => ${tmp_res}") |
2387 | |
2388 | state.match_flag = true |
2389 | l_ist = u32(regex.ist_dot_char) |
2390 | |
2391 | if state.first_match < 0 { |
2392 | state.first_match = state.i |
2393 | } |
2394 | state.match_index = state.i |
2395 | re.prog[state.pc].rep++ // increase repetitions |
2396 | |
2397 | state.i += char_len |
2398 | m_state = .ist_quant_p |
2399 | continue |
2400 | } |
2401 | // simple char IST |
2402 | else if ist == regex.ist_simple_char { |
2403 | // println("ist_simple_char") |
2404 | state.match_flag = false |
2405 | |
2406 | if re.prog[state.pc].ch == ch { |
2407 | state.match_flag = true |
2408 | l_ist = regex.ist_simple_char |
2409 | |
2410 | if state.first_match < 0 { |
2411 | state.first_match = state.i |
2412 | } |
2413 | // println("state.match_index: ${state.match_index}") |
2414 | state.match_index = state.i |
2415 | |
2416 | re.prog[state.pc].rep++ // increase repetitions |
2417 | state.i += char_len // next char |
2418 | m_state = .ist_quant_p |
2419 | continue |
2420 | } |
2421 | m_state = .ist_quant_n |
2422 | continue |
2423 | } |
2424 | // UNREACHABLE |
2425 | // println("PANIC2!! state: $m_state") |
2426 | return regex.err_internal_error, state.i |
2427 | } |
2428 | /*********************************** |
2429 | * Quantifier management |
2430 | ***********************************/ |
2431 | // ist_quant_ng => quantifier negative test on group |
2432 | else if m_state == .ist_quant_ng { |
2433 | // we are finished here |
2434 | if state.group_index < 0 { |
2435 | // println("Early stop!") |
2436 | result = regex.no_match_found |
2437 | m_state = .stop |
2438 | continue |
2439 | } |
2440 | |
2441 | tmp_pc := re.group_data[state.group_index] // PC to the end of the group token |
2442 | rep := re.prog[tmp_pc].group_rep // use a temp variable |
2443 | re.prog[tmp_pc].group_rep = 0 // clear the repetitions |
2444 | |
2445 | // println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep") |
2446 | |
2447 | if rep >= re.prog[tmp_pc].rep_min { |
2448 | // println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index") |
2449 | |
2450 | state.i = re.group_stack[state.group_index] |
2451 | state.pc = tmp_pc |
2452 | state.group_index-- |
2453 | m_state = .ist_next |
2454 | continue |
2455 | } else if re.prog[tmp_pc].next_is_or { |
2456 | // println("ist_quant_ng OR Negative branch") |
2457 | |
2458 | state.i = re.group_stack[state.group_index] |
2459 | state.pc = re.prog[tmp_pc + 1].rep_min - 1 |
2460 | state.group_index-- |
2461 | m_state = .ist_next |
2462 | continue |
2463 | } else if rep > 0 && rep < re.prog[tmp_pc].rep_min { |
2464 | // println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index") |
2465 | |
2466 | // check if we are inside a group, if yes exit from the nested groups |
2467 | if state.group_index > 0 { |
2468 | state.group_index-- |
2469 | state.pc = tmp_pc |
2470 | m_state = .ist_quant_ng //.ist_next |
2471 | continue |
2472 | } |
2473 | |
2474 | if state.group_index == 0 { |
2475 | state.group_index-- |
2476 | state.pc = tmp_pc // TEST |
2477 | m_state = .ist_next |
2478 | continue |
2479 | } |
2480 | |
2481 | result = regex.no_match_found |
2482 | m_state = .stop |
2483 | continue |
2484 | } else if rep == 0 && rep < re.prog[tmp_pc].rep_min { |
2485 | // println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index") |
2486 | |
2487 | if state.group_index > 0 { |
2488 | state.group_index-- |
2489 | state.pc = tmp_pc |
2490 | m_state = .ist_quant_ng //.ist_next |
2491 | continue |
2492 | } |
2493 | |
2494 | result = regex.no_match_found |
2495 | m_state = .stop |
2496 | continue |
2497 | } |
2498 | |
2499 | // println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep") |
2500 | // UNREACHABLE |
2501 | return regex.err_internal_error, state.i |
2502 | } |
2503 | // ist_quant_pg => quantifier positive test on group |
2504 | else if m_state == .ist_quant_pg { |
2505 | // println(".ist_quant_pg") |
2506 | mut tmp_pc := state.pc |
2507 | if state.group_index >= 0 { |
2508 | tmp_pc = re.group_data[state.group_index] |
2509 | } |
2510 | |
2511 | if re.prog[tmp_pc].group_neg == true { |
2512 | // println("***** Negation of the group") |
2513 | result = regex.no_match_found |
2514 | m_state = .stop |
2515 | continue |
2516 | } |
2517 | |
2518 | rep := re.prog[tmp_pc].group_rep |
2519 | |
2520 | if rep < re.prog[tmp_pc].rep_min { |
2521 | // println("ist_quant_pg UNDER RANGE") |
2522 | state.pc = re.prog[tmp_pc].goto_pc |
2523 | m_state = .ist_next |
2524 | continue |
2525 | } else if rep == re.prog[tmp_pc].rep_max { |
2526 | // println("ist_quant_pg MAX RANGE") |
2527 | re.prog[tmp_pc].group_rep = 0 // clear the repetitions |
2528 | state.group_index-- |
2529 | m_state = .ist_next |
2530 | |
2531 | continue |
2532 | } else if rep >= re.prog[tmp_pc].rep_min { |
2533 | // println("ist_quant_pg IN RANGE group_index:$state.group_index") |
2534 | |
2535 | // check greedy flag, if true exit on minimum |
2536 | if re.prog[tmp_pc].greedy == true { |
2537 | re.prog[tmp_pc].group_rep = 0 // clear the repetitions |
2538 | state.group_index-- |
2539 | m_state = .ist_next |
2540 | continue |
2541 | } |
2542 | |
2543 | state.pc = re.prog[tmp_pc].goto_pc - 1 |
2544 | state.group_index-- |
2545 | m_state = .ist_next |
2546 | continue |
2547 | } |
2548 | |
2549 | // UNREACHABLE |
2550 | // println("PANIC3!! state: $m_state") |
2551 | return regex.err_internal_error, state.i |
2552 | } |
2553 | // ist_quant_n => quantifier negative test on token |
2554 | else if m_state == .ist_quant_n { |
2555 | rep := re.prog[state.pc].rep |
2556 | // println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}") |
2557 | |
2558 | // zero quantifier * or ? |
2559 | if rep == 0 && re.prog[state.pc].rep_min == 0 { |
2560 | // println("ist_quant_n c_zero RANGE MIN") |
2561 | m_state = .ist_next // go to next ist |
2562 | continue |
2563 | } |
2564 | // match + or * |
2565 | else if rep >= re.prog[state.pc].rep_min { |
2566 | // println("ist_quant_n MATCH RANGE") |
2567 | m_state = .ist_next |
2568 | continue |
2569 | } |
2570 | |
2571 | // check the OR if present |
2572 | if re.prog[state.pc].next_is_or { |
2573 | // println("OR present on failing") |
2574 | state.match_index = -1 |
2575 | m_state = .ist_next |
2576 | continue |
2577 | } |
2578 | |
2579 | // we are in a group manage no match from here |
2580 | if state.group_index >= 0 { |
2581 | // println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index") |
2582 | m_state = .ist_quant_ng |
2583 | continue |
2584 | } |
2585 | |
2586 | // no other options |
2587 | // println("ist_quant_n no_match_found") |
2588 | result = regex.no_match_found |
2589 | m_state = .stop |
2590 | |
2591 | // stop already started matching outside a capturing group |
2592 | if re.state_list.len > 0 && re.state_list.last().group_index == -1 |
2593 | && re.state_list.last().last_dot_pc > 0 { |
2594 | if ist == regex.ist_dot_char || ist == regex.ist_bsls_char { |
2595 | return regex.no_match_found, 0 |
2596 | } |
2597 | } |
2598 | continue |
2599 | } |
2600 | // ist_quant_p => quantifier positive test on token |
2601 | else if m_state == .ist_quant_p { |
2602 | // println("Here .ist_quant_p") |
2603 | // exit on first match |
2604 | if (re.flag & regex.f_efm) != 0 { |
2605 | return state.i, state.i + 1 |
2606 | } |
2607 | |
2608 | rep := re.prog[state.pc].rep |
2609 | // println(rep) |
2610 | |
2611 | // under range |
2612 | if rep > 0 && rep < re.prog[state.pc].rep_min { |
2613 | // println("ist_quant_p UNDER RANGE") |
2614 | m_state = .ist_load // continue the loop |
2615 | continue |
2616 | } |
2617 | // range ok, continue loop |
2618 | else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max { |
2619 | // println("ist_quant_p IN RANGE") |
2620 | |
2621 | // check greedy flag, if true exit on minimum |
2622 | if re.prog[state.pc].greedy == true { |
2623 | m_state = .ist_next |
2624 | continue |
2625 | } |
2626 | m_state = .ist_load |
2627 | continue |
2628 | } |
2629 | // max reached |
2630 | else if rep == re.prog[state.pc].rep_max { |
2631 | // println("ist_quant_p MAX RANGE") |
2632 | m_state = .ist_next |
2633 | continue |
2634 | } |
2635 | } |
2636 | // UNREACHABLE |
2637 | // println("PANIC4!! state: $m_state") |
2638 | return regex.err_internal_error, state.i |
2639 | } |
2640 | |
2641 | // println("Check end of text!") |
2642 | // Check the results |
2643 | if state.match_index >= 0 { |
2644 | if state.group_index < 0 { |
2645 | if re.prog[state.pc].ist == regex.ist_prog_end { |
2646 | // println("program ended!!") |
2647 | |
2648 | if (re.flag & regex.f_src) != 0 { |
2649 | // println("find return") |
2650 | return state.first_match, state.i |
2651 | } else { |
2652 | // println("Here!!") |
2653 | return 0, state.i |
2654 | } |
2655 | } |
2656 | |
2657 | // println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len") |
2658 | |
2659 | if re.prog[state.pc + 1].ist == regex.ist_prog_end |
2660 | || re.prog[state.pc].ist == regex.ist_prog_end { |
2661 | rep := re.prog[state.pc].rep |
2662 | // println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}") |
2663 | if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max { |
2664 | return state.first_match, state.i |
2665 | } |
2666 | // println("Program not finished! ") |
2667 | return regex.no_match_found, state.i |
2668 | } |
2669 | if src_end { |
2670 | // println("program end") |
2671 | return state.first_match, state.i |
2672 | } |
2673 | // print("No match found!!") |
2674 | return regex.no_match_found, state.i |
2675 | } else { |
2676 | // println("Group match! OK") |
2677 | // println("first_match: $state.first_match, i: $state.i") |
2678 | |
2679 | // println("Skip last group") |
2680 | return state.first_match, state.i |
2681 | // return state.first_match,re.group_stack[state.group_index--] |
2682 | } |
2683 | } |
2684 | // println("no_match_found, natural end") |
2685 | return regex.no_match_found, state.i |
2686 | } |