v / vlib / strings
Raw file | 69 loc (66 sloc) | 1.8 KB | Latest commit hash 6c341a77f
1module strings
2
3// #-js
4// use levenshtein distance algorithm to calculate
5// the distance between between two strings (lower is closer)
6pub fn levenshtein_distance(a string, b string) int {
7 mut f := [0].repeat(b.len + 1)
8 for j in 0 .. f.len {
9 f[j] = j
10 }
11 for ca in a {
12 mut j := 1
13 mut fj1 := f[0]
14 f[0]++
15 for cb in b {
16 mut mn := if f[j] + 1 <= f[j - 1] + 1 { f[j] + 1 } else { f[j - 1] + 1 }
17 if cb != ca {
18 mn = if mn <= fj1 + 1 { mn } else { fj1 + 1 }
19 } else {
20 mn = if mn <= fj1 { mn } else { fj1 }
21 }
22 fj1 = f[j]
23 f[j] = mn
24 j++
25 }
26 }
27 return f[f.len - 1]
28}
29
30// use levenshtein distance algorithm to calculate
31// how similar two strings are as a percentage (higher is closer)
32pub fn levenshtein_distance_percentage(a string, b string) f32 {
33 d := levenshtein_distance(a, b)
34 l := if a.len >= b.len { a.len } else { b.len }
35 return (1.00 - f32(d) / f32(l)) * 100.00
36}
37
38// implementation of Sørensen–Dice coefficient.
39// find the similarity between two strings.
40// returns coefficient between 0.0 (not similar) and 1.0 (exact match).
41pub fn dice_coefficient(s1 string, s2 string) f32 {
42 if s1.len == 0 || s2.len == 0 {
43 return 0.0
44 }
45 if s1 == s2 {
46 return 1.0
47 }
48 if s1.len < 2 || s2.len < 2 {
49 return 0.0
50 }
51 a := if s1.len > s2.len { s1 } else { s2 }
52 b := if a == s1 { s2 } else { s1 }
53 mut first_bigrams := map[string]int{}
54 for i in 0 .. a.len - 1 {
55 bigram := a[i..i + 2]
56 q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
57 first_bigrams[bigram] = q
58 }
59 mut intersection_size := 0
60 for i in 0 .. b.len - 1 {
61 bigram := b[i..i + 2]
62 count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
63 if count > 0 {
64 first_bigrams[bigram] = count - 1
65 intersection_size++
66 }
67 }
68 return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
69}