]> git.immae.eu Git - github/fretlink/terraform-provider-statuscake.git/blob - vendor/github.com/agext/levenshtein/levenshtein.go
deps: github.com/hashicorp/terraform@sdk-v0.11-with-go-modules
[github/fretlink/terraform-provider-statuscake.git] / vendor / github.com / agext / levenshtein / levenshtein.go
1 // Copyright 2016 ALRUX Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 /*
16 Package levenshtein implements distance and similarity metrics for strings, based on the Levenshtein measure.
17
18 The Levenshtein `Distance` between two strings is the minimum total cost of edits that would convert the first string into the second. The allowed edit operations are insertions, deletions, and substitutions, all at character (one UTF-8 code point) level. Each operation has a default cost of 1, but each can be assigned its own cost equal to or greater than 0.
19
20 A `Distance` of 0 means the two strings are identical, and the higher the value the more different the strings. Since in practice we are interested in finding if the two strings are "close enough", it often does not make sense to continue the calculation once the result is mathematically guaranteed to exceed a desired threshold. Providing this value to the `Distance` function allows it to take a shortcut and return a lower bound instead of an exact cost when the threshold is exceeded.
21
22 The `Similarity` function calculates the distance, then converts it into a normalized metric within the range 0..1, with 1 meaning the strings are identical, and 0 that they have nothing in common. A minimum similarity threshold can be provided to speed up the calculation of the metric for strings that are far too dissimilar for the purpose at hand. All values under this threshold are rounded down to 0.
23
24 The `Match` function provides a similarity metric, with the same range and meaning as `Similarity`, but with a bonus for string pairs that share a common prefix and have a similarity above a "bonus threshold". It uses the same method as proposed by Winkler for the Jaro distance, and the reasoning behind it is that these string pairs are very likely spelling variations or errors, and they are more closely linked than the edit distance alone would suggest.
25
26 The underlying `Calculate` function is also exported, to allow the building of other derivative metrics, if needed.
27 */
28 package levenshtein
29
30 // Calculate determines the Levenshtein distance between two strings, using
31 // the given costs for each edit operation. It returns the distance along with
32 // the lengths of the longest common prefix and suffix.
33 //
34 // If maxCost is non-zero, the calculation stops as soon as the distance is determined
35 // to be greater than maxCost. Therefore, any return value higher than maxCost is a
36 // lower bound for the actual distance.
37 func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist, prefixLen, suffixLen int) {
38 l1, l2 := len(str1), len(str2)
39 // trim common prefix, if any, as it doesn't affect the distance
40 for ; prefixLen < l1 && prefixLen < l2; prefixLen++ {
41 if str1[prefixLen] != str2[prefixLen] {
42 break
43 }
44 }
45 str1, str2 = str1[prefixLen:], str2[prefixLen:]
46 l1 -= prefixLen
47 l2 -= prefixLen
48 // trim common suffix, if any, as it doesn't affect the distance
49 for 0 < l1 && 0 < l2 {
50 if str1[l1-1] != str2[l2-1] {
51 str1, str2 = str1[:l1], str2[:l2]
52 break
53 }
54 l1--
55 l2--
56 suffixLen++
57 }
58 // if the first string is empty, the distance is the length of the second string times the cost of insertion
59 if l1 == 0 {
60 dist = l2 * insCost
61 return
62 }
63 // if the second string is empty, the distance is the length of the first string times the cost of deletion
64 if l2 == 0 {
65 dist = l1 * delCost
66 return
67 }
68
69 // variables used in inner "for" loops
70 var y, dy, c, l int
71
72 // if maxCost is greater than or equal to the maximum possible distance, it's equivalent to 'unlimited'
73 if maxCost > 0 {
74 if subCost < delCost+insCost {
75 if maxCost >= l1*subCost+(l2-l1)*insCost {
76 maxCost = 0
77 }
78 } else {
79 if maxCost >= l1*delCost+l2*insCost {
80 maxCost = 0
81 }
82 }
83 }
84
85 if maxCost > 0 {
86 // prefer the longer string first, to minimize time;
87 // a swap also transposes the meanings of insertion and deletion.
88 if l1 < l2 {
89 str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
90 }
91
92 // the length differential times cost of deletion is a lower bound for the cost;
93 // if it is higher than the maxCost, there is no point going into the main calculation.
94 if dist = (l1 - l2) * delCost; dist > maxCost {
95 return
96 }
97
98 d := make([]int, l1+1)
99
100 // offset and length of d in the current row
101 doff, dlen := 0, 1
102 for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dlen++ {
103 d[y] = dy
104 y++
105 dy = y * delCost
106 }
107 // fmt.Printf("%q -> %q: init doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, doff, dlen, doff, doff+dlen, d[doff:doff+dlen])
108
109 for x := 0; x < l2; x++ {
110 dy, d[doff] = d[doff], d[doff]+insCost
111 for d[doff] > maxCost && dlen > 0 {
112 if str1[doff] != str2[x] {
113 dy += subCost
114 }
115 doff++
116 dlen--
117 if c = d[doff] + insCost; c < dy {
118 dy = c
119 }
120 dy, d[doff] = d[doff], dy
121 }
122 for y, l = doff, doff+dlen-1; y < l; dy, d[y] = d[y], dy {
123 if str1[y] != str2[x] {
124 dy += subCost
125 }
126 if c = d[y] + delCost; c < dy {
127 dy = c
128 }
129 y++
130 if c = d[y] + insCost; c < dy {
131 dy = c
132 }
133 }
134 if y < l1 {
135 if str1[y] != str2[x] {
136 dy += subCost
137 }
138 if c = d[y] + delCost; c < dy {
139 dy = c
140 }
141 for ; dy <= maxCost && y < l1; dy, d[y] = dy+delCost, dy {
142 y++
143 dlen++
144 }
145 }
146 // fmt.Printf("%q -> %q: x=%d doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, x, doff, dlen, doff, doff+dlen, d[doff:doff+dlen])
147 if dlen == 0 {
148 dist = maxCost + 1
149 return
150 }
151 }
152 if doff+dlen-1 < l1 {
153 dist = maxCost + 1
154 return
155 }
156 dist = d[l1]
157 } else {
158 // ToDo: This is O(l1*l2) time and O(min(l1,l2)) space; investigate if it is
159 // worth to implement diagonal approach - O(l1*(1+dist)) time, up to O(l1*l2) space
160 // http://www.csse.monash.edu.au/~lloyd/tildeStrings/Alignment/92.IPL.html
161
162 // prefer the shorter string first, to minimize space; time is O(l1*l2) anyway;
163 // a swap also transposes the meanings of insertion and deletion.
164 if l1 > l2 {
165 str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
166 }
167 d := make([]int, l1+1)
168
169 for y = 1; y <= l1; y++ {
170 d[y] = y * delCost
171 }
172 for x := 0; x < l2; x++ {
173 dy, d[0] = d[0], d[0]+insCost
174 for y = 0; y < l1; dy, d[y] = d[y], dy {
175 if str1[y] != str2[x] {
176 dy += subCost
177 }
178 if c = d[y] + delCost; c < dy {
179 dy = c
180 }
181 y++
182 if c = d[y] + insCost; c < dy {
183 dy = c
184 }
185 }
186 }
187 dist = d[l1]
188 }
189
190 return
191 }
192
193 // Distance returns the Levenshtein distance between str1 and str2, using the
194 // default or provided cost values. Pass nil for the third argument to use the
195 // default cost of 1 for all three operations, with no maximum.
196 func Distance(str1, str2 string, p *Params) int {
197 if p == nil {
198 p = defaultParams
199 }
200 dist, _, _ := Calculate([]rune(str1), []rune(str2), p.maxCost, p.insCost, p.subCost, p.delCost)
201 return dist
202 }
203
204 // Similarity returns a score in the range of 0..1 for how similar the two strings are.
205 // A score of 1 means the strings are identical, and 0 means they have nothing in common.
206 //
207 // A nil third argument uses the default cost of 1 for all three operations.
208 //
209 // If a non-zero MinScore value is provided in the parameters, scores lower than it
210 // will be returned as 0.
211 func Similarity(str1, str2 string, p *Params) float64 {
212 return Match(str1, str2, p.Clone().BonusThreshold(1.1)) // guaranteed no bonus
213 }
214
215 // Match returns a similarity score adjusted by the same method as proposed by Winkler for
216 // the Jaro distance - giving a bonus to string pairs that share a common prefix, only if their
217 // similarity score is already over a threshold.
218 //
219 // The score is in the range of 0..1, with 1 meaning the strings are identical,
220 // and 0 meaning they have nothing in common.
221 //
222 // A nil third argument uses the default cost of 1 for all three operations, maximum length of
223 // common prefix to consider for bonus of 4, scaling factor of 0.1, and bonus threshold of 0.7.
224 //
225 // If a non-zero MinScore value is provided in the parameters, scores lower than it
226 // will be returned as 0.
227 func Match(str1, str2 string, p *Params) float64 {
228 s1, s2 := []rune(str1), []rune(str2)
229 l1, l2 := len(s1), len(s2)
230 // two empty strings are identical; shortcut also avoids divByZero issues later on.
231 if l1 == 0 && l2 == 0 {
232 return 1
233 }
234
235 if p == nil {
236 p = defaultParams
237 }
238
239 // a min over 1 can never be satisfied, so the score is 0.
240 if p.minScore > 1 {
241 return 0
242 }
243
244 insCost, delCost, maxDist, max := p.insCost, p.delCost, 0, 0
245 if l1 > l2 {
246 l1, l2, insCost, delCost = l2, l1, delCost, insCost
247 }
248
249 if p.subCost < delCost+insCost {
250 maxDist = l1*p.subCost + (l2-l1)*insCost
251 } else {
252 maxDist = l1*delCost + l2*insCost
253 }
254
255 // a zero min is always satisfied, so no need to set a max cost.
256 if p.minScore > 0 {
257 // if p.minScore is lower than p.bonusThreshold, we can use a simplified formula
258 // for the max cost, because a sim score below min cannot receive a bonus.
259 if p.minScore < p.bonusThreshold {
260 // round down the max - a cost equal to a rounded up max would already be under min.
261 max = int((1 - p.minScore) * float64(maxDist))
262 } else {
263 // p.minScore <= sim + p.bonusPrefix*p.bonusScale*(1-sim)
264 // p.minScore <= (1-dist/maxDist) + p.bonusPrefix*p.bonusScale*(1-(1-dist/maxDist))
265 // p.minScore <= 1 - dist/maxDist + p.bonusPrefix*p.bonusScale*dist/maxDist
266 // 1 - p.minScore >= dist/maxDist - p.bonusPrefix*p.bonusScale*dist/maxDist
267 // (1-p.minScore)*maxDist/(1-p.bonusPrefix*p.bonusScale) >= dist
268 max = int((1 - p.minScore) * float64(maxDist) / (1 - float64(p.bonusPrefix)*p.bonusScale))
269 }
270 }
271
272 dist, pl, _ := Calculate(s1, s2, max, p.insCost, p.subCost, p.delCost)
273 if max > 0 && dist > max {
274 return 0
275 }
276 sim := 1 - float64(dist)/float64(maxDist)
277
278 if sim >= p.bonusThreshold && sim < 1 && p.bonusPrefix > 0 && p.bonusScale > 0 {
279 if pl > p.bonusPrefix {
280 pl = p.bonusPrefix
281 }
282 sim += float64(pl) * p.bonusScale * (1 - sim)
283 }
284
285 if sim < p.minScore {
286 return 0
287 }
288
289 return sim
290 }