--[[ Fuzzel v1.0 - Alexander "Apickx" Pickering Entered into the public domain June 2, 2016 You are not required to, but consider linking back to the source! Some helper functions for calculateing distance between two strings Provides: fuzzel.LevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost) Calculates the Levenshtien Distance between two strings, useing the costs given. "Real" Levenshtien Distance uses values 1,1,1 for costs. returns number_distance fuzzel.LevenshtienDistance(string_first, strings_second) Calculates the "real" Levenshtien Distance returns number distance fuzzel.LevensteinRatio(string_first, string_second) The Levenshtien Ratio divided by the first string's length. Useing a ratio is a decent way to determin if a spelling is "close enough" returns number_distance fuzzel.DamerauLevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost, number_transpositioncost) Damerau-Levenshtien Distance is almost exactly like Levenshtien Distance, with the caveat that two letters next to each other, with swapped positions only counts as "one" cost (in "real" Damerau-Levenshtien Distance) returns number fuzzel.DamerauLevenshtienDistance(stirng_first, strings_second) Calculates the "real" Damerau-Levenshtien Distance returns number fuzzel.DamerauLevenshtienRatio(string_first, string_second) The Damerau-Levenshtien Distance divided by the first string's length returns number fuzzel.HammingDistance(string_first, string_second) Purely the number of substitutions needed to change one string into another. Note that both strings must be the same length. returns number fuzzel.HammingRatio(string_first, string_second) The hamming distance divided by the length of the first string returns number fuzzel.FuzzySearchDistance(string_needle, vararg_in) in may be either a table, or a list of arguments. fuzzel.FuzzySearchDistance will find the string that most closely resembles needle, based on Damerau-Levenshtien Distance returns string_closest, number_distance fuzzel.FuzzySearchRatio(string_needle, vararg_in) in may be either a table, or a list of arguments. Same as above, except it returns the string with the closest Damerau-Levenshtien ratio. returns string_closest, nubmer_ratio ]] local strlen,chrat,min,asrt,prs,iprs,typ,upack = string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack module("fuzzel") local function genericDistance( stringa, stringb, addcost, subcost, delcost, ...) --Length of each string local salen, sblen = strlen(stringa), strlen(stringb) --Create a 0 matrix the size of len(a) x len(b) local dyntbl = {} for i = 0,salen do dyntbl[i] = {} for j = 0,sblen do dyntbl[i][j] = 0 end end --Initalize the matrix for i = 1,salen do dyntbl[i][0] = i end for j = 1,sblen do dyntbl[0][j] = j end --And build up the matrix based on costs-so-far for j = 1,sblen do for i = 1,salen do local ca = chrat(stringa,i) local cb = chrat(stringb,j) dyntbl[i][j] = min( dyntbl[i-1][j] + delcost, --deletion dyntbl[i][j-1] + addcost, --insertion dyntbl[i-1][j-1] + (ca == cb and 0 or subcost) --substituion ) if arg[1] and i > 1 and j > 1 and ca == chrat(stringb,j-1) and chrat(stringa,i-1) == cb then dyntbl[i][j] = min(dyntbl[i][j], dyntbl[i-2][j-2] + (ca == cb and 0 or arg[2])) --transposition end end end return dyntbl[salen][sblen] end function LevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost) return genericDistance(stringa, stringb, addcost, subcost, delcost) end function LevenshtienDistance(stringa,stringb) return LevenshtienDistance_extended(stringa,stringb,1,1,1) end --The distance as a ratio of stringa's length function LevenshteinRatio(stringa,stringb) return LevenshtienDistance(stringa,stringb) / strlen(stringa) end --Almost the same as LevenshtienDistance, but considers two characters swaped as only "one" mistake function DamerauLevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost, trncost) return genericDistance(stringa,stringb,addcost,subcost,delcost,true,trncost) end function DamerauLevenshtienDistance(stringa,stringb) return DamerauLevenshtienDistance_extended(stringa,stringb,1,1,1,1) end function DamerauLevenshtienRatio(stringa,stringb) return DamerauLevenshtienDistance(stringa,stringb) / strlen(stringa) end --Purely the number of mistakes function HammingDistance(stringa,stringb) local len = strlen(stringa) asrt(len == strlen(stringb),"Hamming Distance cannot be calculated on two strings of different lengths:\"" .. stringa .. "\" \"" .. stringb .. "\"") local dist = 0 for i = 1,len do dist = dist + (chrat(stringa,i) ~= chrat(stringb,i) and 1) end return dist end function HammingRatio(stringa,stringb) return HammingDistance(stringa,stringb) / strlen(stringa) end local function FuzzySearch(str,func,...) local itrfunc = typ(arg[1]) == "table" and prs or iprs local tmin = func(arg[1],str) local sout = arg[1] for k,v in itrfunc(arg) do local t = func(v,str) if t < tmin then tmin = t sout = v end end return sout, tmin end function FuzzySearchDistance(str,...) return upack{FuzzySearch(str,DamerauLevenshtienDistance,...)} end function FuzzySearchRatio(str,...) return upack{FuzzySearch(str,DamerauLevenshtienRatio,...)} end --Some easy-to-use mnemonics ld_e = LevenshtienDistance_extended ld = LevenshtienDistance lr = LevensteinRatio dld_e = DamerauLevenshtienDistance_extended dld = DamerauLevenshtienDistance dlr = DamerauLevenshtienRatio hd = HammingDistance hr = HammingRatio fsd = FuzzySearchDistance fsr = FuzzySearchRatio