--[[ Fuzzel v1.0 - Alexander "Apickx" Pickering Entered into the public domain June 2, 2016 You are not required to, but consider linking back to the source! Some helper functions for calculateing distance between two strings Provides: fuzzel.LevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost) Calculates the Levenshtien Distance between two strings, useing the costs given. "Real" Levenshtien Distance uses values 1,1,1 for costs. returns number_distance fuzzel.LevenshtienDistance(string_first, strings_second) Calculates the "real" Levenshtien Distance returns number_distance fuzzel.LevensteinRatio(string_first, string_second) The Levenshtien Ratio divided by the first string's length. Useing a ratio is a decent way to determin if a spelling is "close enough" returns number_distance fuzzel.DamerauLevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost, number_transpositioncost) Damerau-Levenshtien Distance is almost exactly like Levenshtien Distance, with the caveat that two letters next to each other, with swapped positions only counts as "one" cost (in "real" Damerau-Levenshtien Distance) returns number fuzzel.DamerauLevenshtienDistance(stirng_first, strings_second) Calculates the "real" Damerau-Levenshtien Distance returns number fuzzel.DamerauLevenshtienRatio(string_first, string_second) The Damerau-Levenshtien Distance divided by the first string's length returns number_ratio fuzzel.HammingDistance(string_first, string_second) Purely the number of substitutions needed to change one string into another. Note that both strings must be the same length. returns number_distance fuzzel.HammingRatio(string_first, string_second) The hamming distance divided by the length of the first string returns number_ratio fuzzel.FuzzyFindDistance(string_needle, vararg_in) in may be either a table, or a list of arguments. fuzzel.FuzzySearchDistance will find the string that most closely resembles needle, based on Damerau-Levenshtien Distance returns string_closest, number_distance fuzzel.FuzzyFindRatio(string_needle, vararg_in) in may be either a table, or a list of arguments. Same as above, except it returns the string with the closest Damerau-Levenshtien ratio. returns string_closest, nubmer_ratio fuzzel.FuzzySortDistance(string_needle, vararg_in) Sorts either the table, or the arguments, and returns a table. Uses Damerau-Levenshtien Distance returns table_sorted fuzzel.FuzzySortRatio(string needle, vararg_in) Same as above, but uses Damerau-Levenshtien Ratio instead returns table_sorted Example: Returns a function that will return the closest string to the string it was passed -----------------FuzzelExample.lua------------------ --Include the module local fuzzel = require("fuzzel.lua") --A couple of options local options = { "Fat Cat", "Lazy Dog", "Brown Fox", } --And use it, to see what option closest matches "Brown Cat" print("\"Lulzy Cat\" is close to \"" .. fuzzel.FuzzyFindDistance("Lulzy Cat",options) .. "\"") --Sort the options to see the order in which they most closely match "Brown Cat" print("\"Frag God\" is closest to:") for k,v in ipairs(fuzzel.FuzzySortRatio("Frag God",options)) do print(k .. "\t:\t" .. v) end -------------End FuzzelExample.lua------------------ Outputs: "Lulzy Cat" is close to "Fat Cat" "Frag God" is closest to: 1 : Fat Cat 2 : Lazy Dog 3 : Brown Fox Some easy-to-use mnemonics fuzzel.ld_e = fuzzel.LevenshtienDistance_extended fuzzel.ld = fuzzel.LevenshtienDistance fuzzel.lr = fuzzel.LevensteinRatio fuzzel.dld_e = fuzzel.DamerauLevenshtienDistance_extended fuzzel.dld = fuzzel.DamerauLevenshtienDistance fuzzel.dlr = fuzzel.DamerauLevenshtienRatio fuzzel.hd = fuzzel.HammingDistance fuzzel.hr = fuzzel.HammingRatio fuzzel.ffd = fuzzel.FuzzyFindDistance fuzzel.ffr = fuzzel.FuzzyFindRatio fuzzel.fsd = fuzzel.FuzzySortDistance fuzzel.fsr = fuzzel.FuzzySortRatio ]] local strlen,chrat,min,asrt,prs,iprs,typ,upack,tblins,tblsrt = string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack,table.insert,table.sort local fuzzel = {} local function genericDistance( stringa, stringb, addcost, subcost, delcost, ...) --Length of each string local salen, sblen = strlen(stringa), strlen(stringb) --Create a 0 matrix the size of len(a) x len(b) local dyntbl = {} for i = 0,salen do dyntbl[i] = {} for j = 0,sblen do dyntbl[i][j] = 0 end end --Initalize the matrix for i = 1,salen do dyntbl[i][0] = i end for j = 1,sblen do dyntbl[0][j] = j end --And build up the matrix based on costs-so-far for j = 1,sblen do for i = 1,salen do local ca = chrat(stringa,i) local cb = chrat(stringb,j) dyntbl[i][j] = min( dyntbl[i-1][j] + delcost, --deletion dyntbl[i][j-1] + addcost, --insertion dyntbl[i-1][j-1] + (ca == cb and 0 or subcost) --substituion ) if arg[1] and i > 1 and j > 1 and ca == chrat(stringb,j-1) and chrat(stringa,i-1) == cb then dyntbl[i][j] = min(dyntbl[i][j], dyntbl[i-2][j-2] + (ca == cb and 0 or arg[2])) --transposition end end end return dyntbl[salen][sblen] end function fuzzel.LevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost) return fuzzel.genericDistance(stringa, stringb, addcost, subcost, delcost) end fuzzel.ld_e = fuzzel.LevenshtienDistance_extended function fuzzel.LevenshtienDistance(stringa,stringb) return fuzzel.ld_e(stringa,stringb,1,1,1) end fuzzel.ld = fuzzel.LevenshtienDistance function fuzzel.LevenshteinRatio(stringa,stringb) return fuzzel.ld(stringa,stringb) / strlen(stringa) end fuzzel.lr = fuzzel.LevensteinRatio function fuzzel.DamerauLevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost, trncost) return genericDistance(stringa,stringb,addcost,subcost,delcost,true,trncost) end fuzzel.dld_e = fuzzel.DamerauLevenshtienDistance_extended function fuzzel.DamerauLevenshtienDistance(stringa,stringb) return fuzzel.dld_e(stringa,stringb,1,1,1,1) end fuzzel.dld = fuzzel.DamerauLevenshtienDistance function fuzzel.DamerauLevenshtienRatio(stringa,stringb) return fuzzel.dld(stringa,stringb) / strlen(stringa) end fuzzel.dlr = fuzzel.DamerauLevenshtienRatio function fuzzel.HammingDistance(stringa,stringb) local len = strlen(stringa) asrt(len == strlen(stringb),"Hamming Distance cannot be calculated on two strings of different lengths:\"" .. stringa .. "\" \"" .. stringb .. "\"") local dist = 0 for i = 1,len do dist = dist + (chrat(stringa,i) ~= chrat(stringb,i) and 1) end return dist end fuzzel.hd = fuzzel.HammingDistance function fuzzel.HammingRatio(stringa,stringb) return fuzzel.HammingDistance(stringa,stringb) / strlen(stringa) end fuzzel.hr = fuzzel.HammingRatio local function FuzzySearch(str,func,...) local looparg = typ(arg[1]) == "table" and arg[1] or arg local tmin = func(looparg[1],str) local sout = looparg[1] for k,v in prs(looparg) do local t = func(v,str) if t < tmin then tmin = t sout = v end end return sout, tmin end function fuzzel.FuzzyFindDistance(str,...) return upack{FuzzySearch(str,fuzzel.dld,...)} end fuzzel.ffd = fuzzel.FuzzyFindDistance function fuzzel.FuzzyFindRatio(str,...) return upack{FuzzySearch(str,fuzzel.dlr,...)} end local function FuzzySort(str, func, ...) local looparg = typ(arg[1]) == "table" and arg[1] or arg local usorted = {} for k,v in prs(looparg) do local dist = func(str,v) usorted[dist] = usorted[dist] or {} tblins(usorted[dist],v) end local sorted = {} for k,v in prs(usorted) do tblins(sorted,k) end tblsrt(sorted) local otbl = {} for k,v in prs(sorted) do for i,j in prs(usorted[v]) do tblins(otbl,j) end end return otbl end fuzzel.ffr = fuzzel.FuzzyFindRatio function fuzzel.FuzzySortDistance(str,...) return upack{FuzzySort(str,fuzzel.dld,...)} end fuzzel.fsd = fuzzel.FuzzySortDistance function fuzzel.FuzzySortRatio(str,...) return upack{FuzzySort(str,fuzzel.dlr,...)} end fuzzel.fsr = fuzzel.FuzzySortRatio return fuzzel