vendor/github.com/agext/levenshtein/levenshtein.go

   1 // Copyright 2016 ALRUX Inc.
   2 //
   3 // Licensed under the Apache License, Version 2.0 (the "License");
   4 // you may not use this file except in compliance with the License.
   5 // You may obtain a copy of the License at
   6 //
   7 //    http://www.apache.org/licenses/LICENSE-2.0
   8 //
   9 // Unless required by applicable law or agreed to in writing, software
  10 // distributed under the License is distributed on an "AS IS" BASIS,
  11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 // See the License for the specific language governing permissions and
  13 // limitations under the License.
  14
  15 /*
  16 Package levenshtein implements distance and similarity metrics for strings, based on the Levenshtein measure.
  17
  18 The Levenshtein `Distance` between two strings is the minimum total cost of edits that would convert the first string into the second. The allowed edit operations are insertions, deletions, and substitutions, all at character (one UTF-8 code point) level. Each operation has a default cost of 1, but each can be assigned its own cost equal to or greater than 0.
  19
  20 A `Distance` of 0 means the two strings are identical, and the higher the value the more different the strings. Since in practice we are interested in finding if the two strings are "close enough", it often does not make sense to continue the calculation once the result is mathematically guaranteed to exceed a desired threshold. Providing this value to the `Distance` function allows it to take a shortcut and return a lower bound instead of an exact cost when the threshold is exceeded.
  21
  22 The `Similarity` function calculates the distance, then converts it into a normalized metric within the range 0..1, with 1 meaning the strings are identical, and 0 that they have nothing in common. A minimum similarity threshold can be provided to speed up the calculation of the metric for strings that are far too dissimilar for the purpose at hand. All values under this threshold are rounded down to 0.
  23
  24 The `Match` function provides a similarity metric, with the same range and meaning as `Similarity`, but with a bonus for string pairs that share a common prefix and have a similarity above a "bonus threshold". It uses the same method as proposed by Winkler for the Jaro distance, and the reasoning behind it is that these string pairs are very likely spelling variations or errors, and they are more closely linked than the edit distance alone would suggest.
  25
  26 The underlying `Calculate` function is also exported, to allow the building of other derivative metrics, if needed.
  27 */
  28 package levenshtein
  29
  30 // Calculate determines the Levenshtein distance between two strings, using
  31 // the given costs for each edit operation. It returns the distance along with
  32 // the lengths of the longest common prefix and suffix.
  33 //
  34 // If maxCost is non-zero, the calculation stops as soon as the distance is determined
  35 // to be greater than maxCost. Therefore, any return value higher than maxCost is a
  36 // lower bound for the actual distance.
  37 func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist, prefixLen, suffixLen int) {
  38         l1, l2 := len(str1), len(str2)
  39         // trim common prefix, if any, as it doesn't affect the distance
  40         for ; prefixLen < l1 && prefixLen < l2; prefixLen++ {
  41                 if str1[prefixLen] != str2[prefixLen] {
  42                         break
  43                 }
  44         }
  45         str1, str2 = str1[prefixLen:], str2[prefixLen:]
  46         l1 -= prefixLen
  47         l2 -= prefixLen
  48         // trim common suffix, if any, as it doesn't affect the distance
  49         for 0 < l1 && 0 < l2 {
  50                 if str1[l1-1] != str2[l2-1] {
  51                         str1, str2 = str1[:l1], str2[:l2]
  52                         break
  53                 }
  54                 l1--
  55                 l2--
  56                 suffixLen++
  57         }
  58         // if the first string is empty, the distance is the length of the second string times the cost of insertion
  59         if l1 == 0 {
  60                 dist = l2 * insCost
  61                 return
  62         }
  63         // if the second string is empty, the distance is the length of the first string times the cost of deletion
  64         if l2 == 0 {
  65                 dist = l1 * delCost
  66                 return
  67         }
  68
  69         // variables used in inner "for" loops
  70         var y, dy, c, l int
  71
  72         // if maxCost is greater than or equal to the maximum possible distance, it's equivalent to 'unlimited'
  73         if maxCost > 0 {
  74                 if subCost < delCost+insCost {
  75                         if maxCost >= l1*subCost+(l2-l1)*insCost {
  76                                 maxCost = 0
  77                         }
  78                 } else {
  79                         if maxCost >= l1*delCost+l2*insCost {
  80                                 maxCost = 0
  81                         }
  82                 }
  83         }
  84
  85         if maxCost > 0 {
  86                 // prefer the longer string first, to minimize time;
  87                 // a swap also transposes the meanings of insertion and deletion.
  88                 if l1 < l2 {
  89                         str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
  90                 }
  91
  92                 // the length differential times cost of deletion is a lower bound for the cost;
  93                 // if it is higher than the maxCost, there is no point going into the main calculation.
  94                 if dist = (l1 - l2) * delCost; dist > maxCost {
  95                         return
  96                 }
  97
  98                 d := make([]int, l1+1)
  99
 100                 // offset and length of d in the current row
 101                 doff, dlen := 0, 1
 102                 for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dlen++ {
 103                         d[y] = dy
 104                         y++
 105                         dy = y * delCost
 106                 }
 107                 // fmt.Printf("%q -> %q: init doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, doff, dlen, doff, doff+dlen, d[doff:doff+dlen])
 108
 109                 for x := 0; x < l2; x++ {
 110                         dy, d[doff] = d[doff], d[doff]+insCost
 111                         for d[doff] > maxCost && dlen > 0 {
 112                                 if str1[doff] != str2[x] {
 113                                         dy += subCost
 114                                 }
 115                                 doff++
 116                                 dlen--
 117                                 if c = d[doff] + insCost; c < dy {
 118                                         dy = c
 119                                 }
 120                                 dy, d[doff] = d[doff], dy
 121                         }
 122                         for y, l = doff, doff+dlen-1; y < l; dy, d[y] = d[y], dy {
 123                                 if str1[y] != str2[x] {
 124                                         dy += subCost
 125                                 }
 126                                 if c = d[y] + delCost; c < dy {
 127                                         dy = c
 128                                 }
 129                                 y++
 130                                 if c = d[y] + insCost; c < dy {
 131                                         dy = c
 132                                 }
 133                         }
 134                         if y < l1 {
 135                                 if str1[y] != str2[x] {
 136                                         dy += subCost
 137                                 }
 138                                 if c = d[y] + delCost; c < dy {
 139                                         dy = c
 140                                 }
 141                                 for ; dy <= maxCost && y < l1; dy, d[y] = dy+delCost, dy {
 142                                         y++
 143                                         dlen++
 144                                 }
 145                         }
 146                         // fmt.Printf("%q -> %q: x=%d doff=%d dlen=%d d[%d:%d]=%v\n", str1, str2, x, doff, dlen, doff, doff+dlen, d[doff:doff+dlen])
 147                         if dlen == 0 {
 148                                 dist = maxCost + 1
 149                                 return
 150                         }
 151                 }
 152                 if doff+dlen-1 < l1 {
 153                         dist = maxCost + 1
 154                         return
 155                 }
 156                 dist = d[l1]
 157         } else {
 158                 // ToDo: This is O(l1*l2) time and O(min(l1,l2)) space; investigate if it is
 159                 // worth to implement diagonal approach - O(l1*(1+dist)) time, up to O(l1*l2) space
 160                 // http://www.csse.monash.edu.au/~lloyd/tildeStrings/Alignment/92.IPL.html
 161
 162                 // prefer the shorter string first, to minimize space; time is O(l1*l2) anyway;
 163                 // a swap also transposes the meanings of insertion and deletion.
 164                 if l1 > l2 {
 165                         str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
 166                 }
 167                 d := make([]int, l1+1)
 168
 169                 for y = 1; y <= l1; y++ {
 170                         d[y] = y * delCost
 171                 }
 172                 for x := 0; x < l2; x++ {
 173                         dy, d[0] = d[0], d[0]+insCost
 174                         for y = 0; y < l1; dy, d[y] = d[y], dy {
 175                                 if str1[y] != str2[x] {
 176                                         dy += subCost
 177                                 }
 178                                 if c = d[y] + delCost; c < dy {
 179                                         dy = c
 180                                 }
 181                                 y++
 182                                 if c = d[y] + insCost; c < dy {
 183                                         dy = c
 184                                 }
 185                         }
 186                 }
 187                 dist = d[l1]
 188         }
 189
 190         return
 191 }
 192
 193 // Distance returns the Levenshtein distance between str1 and str2, using the
 194 // default or provided cost values. Pass nil for the third argument to use the
 195 // default cost of 1 for all three operations, with no maximum.
 196 func Distance(str1, str2 string, p *Params) int {
 197         if p == nil {
 198                 p = defaultParams
 199         }
 200         dist, _, _ := Calculate([]rune(str1), []rune(str2), p.maxCost, p.insCost, p.subCost, p.delCost)
 201         return dist
 202 }
 203
 204 // Similarity returns a score in the range of 0..1 for how similar the two strings are.
 205 // A score of 1 means the strings are identical, and 0 means they have nothing in common.
 206 //
 207 // A nil third argument uses the default cost of 1 for all three operations.
 208 //
 209 // If a non-zero MinScore value is provided in the parameters, scores lower than it
 210 // will be returned as 0.
 211 func Similarity(str1, str2 string, p *Params) float64 {
 212         return Match(str1, str2, p.Clone().BonusThreshold(1.1)) // guaranteed no bonus
 213 }
 214
 215 // Match returns a similarity score adjusted by the same method as proposed by Winkler for
 216 // the Jaro distance - giving a bonus to string pairs that share a common prefix, only if their
 217 // similarity score is already over a threshold.
 218 //
 219 // The score is in the range of 0..1, with 1 meaning the strings are identical,
 220 // and 0 meaning they have nothing in common.
 221 //
 222 // A nil third argument uses the default cost of 1 for all three operations, maximum length of
 223 // common prefix to consider for bonus of 4, scaling factor of 0.1, and bonus threshold of 0.7.
 224 //
 225 // If a non-zero MinScore value is provided in the parameters, scores lower than it
 226 // will be returned as 0.
 227 func Match(str1, str2 string, p *Params) float64 {
 228         s1, s2 := []rune(str1), []rune(str2)
 229         l1, l2 := len(s1), len(s2)
 230         // two empty strings are identical; shortcut also avoids divByZero issues later on.
 231         if l1 == 0 && l2 == 0 {
 232                 return 1
 233         }
 234
 235         if p == nil {
 236                 p = defaultParams
 237         }
 238
 239         // a min over 1 can never be satisfied, so the score is 0.
 240         if p.minScore > 1 {
 241                 return 0
 242         }
 243
 244         insCost, delCost, maxDist, max := p.insCost, p.delCost, 0, 0
 245         if l1 > l2 {
 246                 l1, l2, insCost, delCost = l2, l1, delCost, insCost
 247         }
 248
 249         if p.subCost < delCost+insCost {
 250                 maxDist = l1*p.subCost + (l2-l1)*insCost
 251         } else {
 252                 maxDist = l1*delCost + l2*insCost
 253         }
 254
 255         // a zero min is always satisfied, so no need to set a max cost.
 256         if p.minScore > 0 {
 257                 // if p.minScore is lower than p.bonusThreshold, we can use a simplified formula
 258                 // for the max cost, because a sim score below min cannot receive a bonus.
 259                 if p.minScore < p.bonusThreshold {
 260                         // round down the max - a cost equal to a rounded up max would already be under min.
 261                         max = int((1 - p.minScore) * float64(maxDist))
 262                 } else {
 263                         // p.minScore <= sim + p.bonusPrefix*p.bonusScale*(1-sim)
 264                         // p.minScore <= (1-dist/maxDist) + p.bonusPrefix*p.bonusScale*(1-(1-dist/maxDist))
 265                         // p.minScore <= 1 - dist/maxDist + p.bonusPrefix*p.bonusScale*dist/maxDist
 266                         // 1 - p.minScore >= dist/maxDist - p.bonusPrefix*p.bonusScale*dist/maxDist
 267                         // (1-p.minScore)*maxDist/(1-p.bonusPrefix*p.bonusScale) >= dist
 268                         max = int((1 - p.minScore) * float64(maxDist) / (1 - float64(p.bonusPrefix)*p.bonusScale))
 269                 }
 270         }
 271
 272         dist, pl, _ := Calculate(s1, s2, max, p.insCost, p.subCost, p.delCost)
 273         if max > 0 && dist > max {
 274                 return 0
 275         }
 276         sim := 1 - float64(dist)/float64(maxDist)
 277
 278         if sim >= p.bonusThreshold && sim < 1 && p.bonusPrefix > 0 && p.bonusScale > 0 {
 279                 if pl > p.bonusPrefix {
 280                         pl = p.bonusPrefix
 281                 }
 282                 sim += float64(pl) * p.bonusScale * (1 - sim)
 283         }
 284
 285         if sim < p.minScore {
 286                 return 0
 287         }
 288
 289         return sim
 290 }