--[[ Fuzzel v1.0 - Alexander "Apickx" Pickering Entered into the public domain June 2, 2016 You are not required to, but consider putting a link to the source in your file's comments! Some helper functions for calculateing distance between two strings Provides: fuzzel.LevenshteinDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost) Calculates the Levenshtein Distance between two strings, useing the costs given. "Real" Levenshtein Distance uses values 1,1,1 for costs. returns number_distance fuzzel.LevenshteinDistance(string_first, strings_second) Calculates the "real" Levenshtein Distance returns number_distance fuzzel.LevensteinRatio(string_first, string_second) The Levenshtein Ratio divided by the first string's length. Useing a ratio is a decent way to determin if a spelling is "close enough" returns number_distance fuzzel.DamerauLevenshteinDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost, number_transpositioncost) Damerau-Levenshtein Distance is almost exactly like Levenshtein Distance, with the caveat that two letters next to each other, with swapped positions only counts as "one" cost (in "real" Damerau-Levenshtein Distance) returns number fuzzel.DamerauLevenshteinDistance(stirng_first, strings_second) Calculates the "real" Damerau-Levenshtein Distance returns number fuzzel.DamerauLevenshteinRatio(string_first, string_second) The Damerau-Levenshtein Distance divided by the first string's length returns number_ratio fuzzel.HammingDistance(string_first, string_second) Purely the number of substitutions needed to change one string into another. Note that both strings must be the same length. returns number_distance fuzzel.HammingRatio(string_first, string_second) The hamming distance divided by the length of the first string returns number_ratio fuzzel.FuzzyFindDistance(string_needle, vararg_in) in may be either a table, or a list of arguments. fuzzel.FuzzySearchDistance will find the string that most closely resembles needle, based on Damerau-Levenshtein Distance returns string_closest, number_distance fuzzel.FuzzyFindRatio(string_needle, vararg_in) in may be either a table, or a list of arguments. Same as above, except it returns the string with the closest Damerau-Levenshtein ratio. returns string_closest, nubmer_ratio fuzzel.FuzzySortDistance(string_needle, vararg_in) Sorts either the table, or the arguments, and returns a table. Uses Damerau-Levenshtein Distance returns table_sorted fuzzel.FuzzySortRatio(string needle, vararg_in) Same as above, but uses Damerau-Levenshtein Ratio instead returns table_sorted Example: Returns a function that will return the closest string to the string it was passed -----------------FuzzelExample.lua------------------ --Include the module local fuzzel = require("fuzzel.lua") --A couple of options local options = { "Fat Cat", "Lazy Dog", "Brown Fox", } --And use it, to see what option closest matches "Brown Cat" local close,distance = fuzzel.FuzzyFindDistance("Lulzy Cat", options) print("\"Lulzy Cat\" is close to \"" .. close .. "\", distance:" .. distance) --Sort the options to see the order in which they most closely match "Brown Cat" print("\"Frag God\" is closest to:") for k,v in ipairs(fuzzel.FuzzySortRatio("Frag God",options)) do print(k .. "\t:\t" .. v) end -------------End FuzzelExample.lua------------------ Outputs: "Lulzy Cat" is close to "Fat Cat" "Frag God" is closest to: 1 : Fat Cat 2 : Lazy Dog 3 : Brown Fox Some easy-to-use mnemonics fuzzel.ld_e = fuzzel.LevenshteinDistance_extended fuzzel.ld = fuzzel.LevenshteinDistance fuzzel.lr = fuzzel.LevensteinRatio fuzzel.dld_e = fuzzel.DamerauLevenshteinDistance_extended fuzzel.dld = fuzzel.DamerauLevenshteinDistance fuzzel.dlr = fuzzel.DamerauLevenshteinRatio fuzzel.hd = fuzzel.HammingDistance fuzzel.hr = fuzzel.HammingRatio fuzzel.ffd = fuzzel.FuzzyFindDistance fuzzel.ffr = fuzzel.FuzzyFindRatio fuzzel.fsd = fuzzel.FuzzySortDistance fuzzel.fsr = fuzzel.FuzzySortRatio ]]--You probably don't want to touch anything past this point --Assign locals to these to the minifier can compress the file better local strlen,chrat,min,asrt,prs,iprs,typ,upack,tblins,tblsrt = string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack,table.insert,table.sort local fuzzel = {} --A clever way to allow the minifier to minify function names, this basically just assigns variables with their string equivalent. local da, le, di, ra, fu, fi, so, ex, ha = "Damerau", "Levenshtein", "Distance", "Ratio", "Fuzzy", "Find", "Sort", "_extended", "Hamming" local LevenshteinDistance_extended,LevenshteinDistance,LevenshteinRatio,DamerauLevenshteinDistance_extended,DamerauLevenshteinDistance,DamerauLevenshteinRatio,FuzzyFindDistance,FuzzyFindRatio,FuzzySortDistance,FuzzySortRatio,HammingDistance,HammingRatio = le..di..ex,le..di,le..ra,da..le..di..ex,da..le..di,da..le..ra,fu..fi..di,fu..fi..ra,fu..so..di,fu..so..ra,ha..di,ha..ra local function genericDistance( stringa, stringb, addcost, subcost, delcost, ...) --Length of each string local salen, sblen = strlen(stringa), strlen(stringb) --Create a 0 matrix the size of len(a) x len(b) local dyntbl = {} for i = 0,salen do dyntbl[i] = {} for j = 0,sblen do dyntbl[i][j] = 0 end end --Initalize the matrix for i = 1,salen do dyntbl[i][0] = i end for j = 1,sblen do dyntbl[0][j] = j end --And build up the matrix based on costs-so-far for j = 1,sblen do for i = 1,salen do local ca = chrat(stringa,i) local cb = chrat(stringb,j) dyntbl[i][j] = min( dyntbl[i-1][j] + delcost, --deletion dyntbl[i][j-1] + addcost, --insertion dyntbl[i-1][j-1] + (ca == cb and 0 or subcost) --substituion ) if arg[1] and i > 1 and j > 1 and ca == chrat(stringb,j-1) and chrat(stringa,i-1) == cb then dyntbl[i][j] = min(dyntbl[i][j], dyntbl[i-2][j-2] + (ca == cb and 0 or arg[2])) --transposition end end end return dyntbl[salen][sblen] end fuzzel[LevenshteinDistance_extended] = function(stringa, stringb, addcost, subcost, delcost) return genericDistance(stringa, stringb, addcost, subcost, delcost) end fuzzel.ld_e = fuzzel[LevenshteinDistance_extended] fuzzel[LevenshteinDistance] = function(stringa,stringb) return fuzzel.ld_e(stringa,stringb,1,1,1) end fuzzel.ld = fuzzel[LevenshteinDistance] fuzzel[LevenshteinRatio] = function(stringa,stringb) return fuzzel.ld(stringa,stringb) / strlen(stringa) end fuzzel.lr = fuzzel[LevenshteinRatio] fuzzel[DamerauLevenshteinDistance_extended] = function(stringa, stringb, addcost, subcost, delcost, trncost) return genericDistance(stringa,stringb,addcost,subcost,delcost,true,trncost) end fuzzel.dld_e = fuzzel[DamerauLevenshteinDistance_extended] fuzzel[DamerauLevenshteinDistance] = function(stringa,stringb) return fuzzel.dld_e(stringa,stringb,1,1,1,1) end fuzzel.dld = fuzzel[DamerauLevenshteinDistance] fuzzel[DamerauLevenshteinRatio] = function(stringa,stringb) return fuzzel.dld(stringa,stringb) / strlen(stringa) end fuzzel.dlr = fuzzel[DamerauLevenshteinRatio] fuzzel[HammingDistance] = function(stringa,stringb) local len = strlen(stringa) asrt(len == strlen(stringb),"Hamming Distance cannot be calculated on two strings of different lengths:\"" .. stringa .. "\" \"" .. stringb .. "\"") local dist = 0 for i = 1,len do dist = dist + ((chrat(stringa,i) ~= chrat(stringb,i)) and 1 or 0) end return dist end fuzzel.hd = fuzzel[HammingDistance] fuzzel[HammingRatio] = function(stringa,stringb) return fuzzel.hd(stringa,stringb) / strlen(stringa) end fuzzel.hr = fuzzel[HammingRatio] local function FuzzySearch(str,func,...) --Allow varargs, or a table local looparg = typ(arg[1]) == "table" and arg[1] or arg --Find the string with the shortest distance to the string we were supplied local tmin = func(looparg[1],str) local sout = looparg[1] for k,v in prs(looparg) do local t = func(v,str) if t < tmin then tmin = t sout = v end end return sout, tmin end fuzzel[FuzzyFindDistance] = function(str,...) return upack{FuzzySearch(str,fuzzel.dld,...)} end fuzzel.ffd = fuzzel[FuzzyFindDistance] fuzzel[FuzzyFindRatio] = function(str,...) return upack{FuzzySearch(str,fuzzel.dlr,...)} end local function FuzzySort(str, func, ...) --allow varargs, or a table local looparg = typ(arg[1]) == "table" and arg[1] or arg --Roughly sort everything by it's distance to the string local usorted = {} local sorted = {} for k,v in prs(looparg) do local dist = func(str,v) if usorted[dist] == nil then usorted[dist] = {} tblins(sorted,dist) end tblins(usorted[dist],v) end --Actually sort them into something can can be iterated with ipairs tblsrt(sorted) --Then build our output table local otbl = {} for k,v in iprs(sorted) do for i,j in prs(usorted[v]) do tblins(otbl,j) end end return otbl end fuzzel.ffr = fuzzel[FuzzyFindRatio] fuzzel[FuzzySortDistance] = function(str,...) return FuzzySort(str,fuzzel.dld,...) end fuzzel.fsd = fuzzel[FuzzySortDistance] fuzzel[FuzzySortRatio] = function(str,...) return FuzzySort(str,fuzzel.dlr,...) end fuzzel.fsr = fuzzel[FuzzySortRatio] return fuzzel