--[[ Fuzzel v1.0 - Alexander "Apickx" Pickering Entered into the public domain June 2, 2016 You are not required to, but consider linking back to the source! Some helper functions for calculateing distance between two strings Provides: fuzzel.LevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost) Calculates the Levenshtien Distance between two strings, useing the costs given. "Real" Levenshtien Distance uses values 1,1,1 for costs. returns number_distance fuzzel.LevenshtienDistance(string_first, strings_second) Calculates the "real" Levenshtien Distance returns number_distance fuzzel.LevensteinRatio(string_first, string_second) The Levenshtien Ratio divided by the first string's length. Useing a ratio is a decent way to determin if a spelling is "close enough" returns number_distance fuzzel.DamerauLevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost, number_transpositioncost) Damerau-Levenshtien Distance is almost exactly like Levenshtien Distance, with the caveat that two letters next to each other, with swapped positions only counts as "one" cost (in "real" Damerau-Levenshtien Distance) returns number fuzzel.DamerauLevenshtienDistance(stirng_first, strings_second) Calculates the "real" Damerau-Levenshtien Distance returns number fuzzel.DamerauLevenshtienRatio(string_first, string_second) The Damerau-Levenshtien Distance divided by the first string's length returns number_ratio fuzzel.HammingDistance(string_first, string_second) Purely the number of substitutions needed to change one string into another. Note that both strings must be the same length. returns number_distance fuzzel.HammingRatio(string_first, string_second) The hamming distance divided by the length of the first string returns number_ratio fuzzel.FuzzySearchDistance(string_needle, vararg_in) in may be either a table, or a list of arguments. fuzzel.FuzzySearchDistance will find the string that most closely resembles needle, based on Damerau-Levenshtien Distance returns string_closest, number_distance fuzzel.FuzzySearchRatio(string_needle, vararg_in) in may be either a table, or a list of arguments. Same as above, except it returns the string with the closest Damerau-Levenshtien ratio. returns string_closest, nubmer_ratio Example: Returns a function that will return the closest string to the string it was passed -----------------FuzzelExample.lua------------------ local fuzzel = dofile("fuzzel.lua") function suggestoption(tbl_options) return function(str) local closest = fuzzel.FuzzySearchDistance(str,tbl_options) return closest end end local options = { "Fat Cat", "Lazy Dog", "Brown Fox", } local suggestfunc = suggestoption(options) suggestfunc("Brown Cat") -------------End FuzzelExample.lua------------------ ]] local strlen,chrat,min,asrt,prs,iprs,typ,upack = string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack local fuzzel = {} local function genericDistance( stringa, stringb, addcost, subcost, delcost, ...) --Length of each string local salen, sblen = strlen(stringa), strlen(stringb) --Create a 0 matrix the size of len(a) x len(b) local dyntbl = {} for i = 0,salen do dyntbl[i] = {} for j = 0,sblen do dyntbl[i][j] = 0 end end --Initalize the matrix for i = 1,salen do dyntbl[i][0] = i end for j = 1,sblen do dyntbl[0][j] = j end --And build up the matrix based on costs-so-far for j = 1,sblen do for i = 1,salen do local ca = chrat(stringa,i) local cb = chrat(stringb,j) dyntbl[i][j] = min( dyntbl[i-1][j] + delcost, --deletion dyntbl[i][j-1] + addcost, --insertion dyntbl[i-1][j-1] + (ca == cb and 0 or subcost) --substituion ) if arg[1] and i > 1 and j > 1 and ca == chrat(stringb,j-1) and chrat(stringa,i-1) == cb then dyntbl[i][j] = min(dyntbl[i][j], dyntbl[i-2][j-2] + (ca == cb and 0 or arg[2])) --transposition end end end return dyntbl[salen][sblen] end function fuzzel.LevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost) return fuzzel.genericDistance(stringa, stringb, addcost, subcost, delcost) end function fuzzel.LevenshtienDistance(stringa,stringb) return fuzzel.LevenshtienDistance_extended(stringa,stringb,1,1,1) end --The distance as a ratio of stringa's length function fuzzel.LevenshteinRatio(stringa,stringb) return fuzzel.LevenshtienDistance(stringa,stringb) / strlen(stringa) end --Almost the same as LevenshtienDistance, but considers two characters swaped as only "one" mistake function fuzzel.DamerauLevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost, trncost) return genericDistance(stringa,stringb,addcost,subcost,delcost,true,trncost) end function fuzzel.DamerauLevenshtienDistance(stringa,stringb) return fuzzel.DamerauLevenshtienDistance_extended(stringa,stringb,1,1,1,1) end function fuzzel.DamerauLevenshtienRatio(stringa,stringb) return fuzzel.DamerauLevenshtienDistance(stringa,stringb) / strlen(stringa) end --Purely the number of mistakes function fuzzel.HammingDistance(stringa,stringb) local len = strlen(stringa) asrt(len == strlen(stringb),"Hamming Distance cannot be calculated on two strings of different lengths:\"" .. stringa .. "\" \"" .. stringb .. "\"") local dist = 0 for i = 1,len do dist = dist + (chrat(stringa,i) ~= chrat(stringb,i) and 1) end return dist end function fuzzel.HammingRatio(stringa,stringb) return fuzzel.HammingDistance(stringa,stringb) / strlen(stringa) end local function FuzzySearch(str,func,...) local looparg = typ(arg[1]) == "table" and arg[1] or arg local tmin = func(looparg[1],str) local sout = looparg[1] for k,v in prs(looparg) do local t = func(v,str) if t < tmin then tmin = t sout = v end end return sout, tmin end function fuzzel.FuzzySearchDistance(str,...) return upack{FuzzySearch(str,fuzzel.DamerauLevenshtienDistance,...)} end function fuzzel.FuzzySearchRatio(str,...) return upack{FuzzySearch(str,fuzzel.DamerauLevenshtienRatio,...)} end --Some easy-to-use mnemonics fuzzel.ld_e = fuzzel.LevenshtienDistance_extended fuzzel.ld = fuzzel.LevenshtienDistance fuzzel.lr = fuzzel.LevensteinRatio fuzzel.dld_e = fuzzel.DamerauLevenshtienDistance_extended fuzzel.dld = fuzzel.DamerauLevenshtienDistance fuzzel.dlr = fuzzel.DamerauLevenshtienRatio fuzzel.hd = fuzzel.HammingDistance fuzzel.hr = fuzzel.HammingRatio fuzzel.fsd = fuzzel.FuzzySearchDistance fuzzel.fsr = fuzzel.FuzzySearchRatio return fuzzel