Gitly

v / vlib / strings

Raw file | 69 loc (66 sloc) | 1.8 KB | Latest commit hash 6c341a77f


1 module strings
2 
3 // #-js
4 // use levenshtein distance algorithm to calculate
5 // the distance between between two strings (lower is closer)
6 pub fn levenshtein_distance(a string, b string) int {
7     mut f := [0].repeat(b.len + 1)
8     for j in 0 .. f.len {
9         f[j] = j
10     }
11     for ca in a {
12         mut j := 1
13         mut fj1 := f[0]
14         f[0]++
15         for cb in b {
16             mut mn := if f[j] + 1 <= f[j - 1] + 1 { f[j] + 1 } else { f[j - 1] + 1 }
17             if cb != ca {
18                 mn = if mn <= fj1 + 1 { mn } else { fj1 + 1 }
19             } else {
20                 mn = if mn <= fj1 { mn } else { fj1 }
21             }
22             fj1 = f[j]
23             f[j] = mn
24             j++
25         }
26     }
27     return f[f.len - 1]
28 }
29 
30 // use levenshtein distance algorithm to calculate
31 // how similar two strings are as a percentage (higher is closer)
32 pub fn levenshtein_distance_percentage(a string, b string) f32 {
33     d := levenshtein_distance(a, b)
34     l := if a.len >= b.len { a.len } else { b.len }
35     return (1.00 - f32(d) / f32(l)) * 100.00
36 }
37 
38 // implementation of Sørensen–Dice coefficient.
39 // find the similarity between two strings.
40 // returns coefficient between 0.0 (not similar) and 1.0 (exact match).
41 pub fn dice_coefficient(s1 string, s2 string) f32 {
42     if s1.len == 0 || s2.len == 0 {
43         return 0.0
44     }
45     if s1 == s2 {
46         return 1.0
47     }
48     if s1.len < 2 || s2.len < 2 {
49         return 0.0
50     }
51     a := if s1.len > s2.len { s1 } else { s2 }
52     b := if a == s1 { s2 } else { s1 }
53     mut first_bigrams := map[string]int{}
54     for i in 0 .. a.len - 1 {
55         bigram := a[i..i + 2]
56         q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
57         first_bigrams[bigram] = q
58     }
59     mut intersection_size := 0
60     for i in 0 .. b.len - 1 {
61         bigram := b[i..i + 2]
62         count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
63         if count > 0 {
64             first_bigrams[bigram] = count - 1
65             intersection_size++
66         }
67     }
68     return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
69 }

1	module strings
2
3	// #-js
4	// use levenshtein distance algorithm to calculate
5	// the distance between between two strings (lower is closer)
6	pub fn levenshtein_distance(a string, b string) int {
7	mut f := [0].repeat(b.len + 1)
8	for j in 0 .. f.len {
9	f[j] = j
10	}
11	for ca in a {
12	mut j := 1
13	mut fj1 := f[0]
14	f[0]++
15	for cb in b {
16	mut mn := if f[j] + 1 <= f[j - 1] + 1 { f[j] + 1 } else { f[j - 1] + 1 }
17	if cb != ca {
18	mn = if mn <= fj1 + 1 { mn } else { fj1 + 1 }
19	} else {
20	mn = if mn <= fj1 { mn } else { fj1 }
21	}
22	fj1 = f[j]
23	f[j] = mn
24	j++
25	}
26	}
27	return f[f.len - 1]
28	}
29
30	// use levenshtein distance algorithm to calculate
31	// how similar two strings are as a percentage (higher is closer)
32	pub fn levenshtein_distance_percentage(a string, b string) f32 {
33	d := levenshtein_distance(a, b)
34	l := if a.len >= b.len { a.len } else { b.len }
35	return (1.00 - f32(d) / f32(l)) * 100.00
36	}
37
38	// implementation of Sørensen–Dice coefficient.
39	// find the similarity between two strings.
40	// returns coefficient between 0.0 (not similar) and 1.0 (exact match).
41	pub fn dice_coefficient(s1 string, s2 string) f32 {
42	if s1.len == 0 \|\| s2.len == 0 {
43	return 0.0
44	}
45	if s1 == s2 {
46	return 1.0
47	}
48	if s1.len < 2 \|\| s2.len < 2 {
49	return 0.0
50	}
51	a := if s1.len > s2.len { s1 } else { s2 }
52	b := if a == s1 { s2 } else { s1 }
53	mut first_bigrams := map[string]int{}
54	for i in 0 .. a.len - 1 {
55	bigram := a[i..i + 2]
56	q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
57	first_bigrams[bigram] = q
58	}
59	mut intersection_size := 0
60	for i in 0 .. b.len - 1 {
61	bigram := b[i..i + 2]
62	count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
63	if count > 0 {
64	first_bigrams[bigram] = count - 1
65	intersection_size++
66	}
67	}
68	return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
69	}