diff options
| author | Alexander Pickering <alexandermpickering@gmail.com> | 2016-06-02 16:05:27 -0400 |
|---|---|---|
| committer | Alexander Pickering <alexandermpickering@gmail.com> | 2016-06-02 16:05:27 -0400 |
| commit | 40de7a8474e6013602d6ee38a6b1cedf1d9e14ce (patch) | |
| tree | 355f6650a46c09b6f1805e77e76e80384d7bd330 /fuzzel.lua | |
| download | fuzzel-40de7a8474e6013602d6ee38a6b1cedf1d9e14ce.tar.gz fuzzel-40de7a8474e6013602d6ee38a6b1cedf1d9e14ce.tar.bz2 fuzzel-40de7a8474e6013602d6ee38a6b1cedf1d9e14ce.zip | |
Initial commit
Diffstat (limited to 'fuzzel.lua')
| -rw-r--r-- | fuzzel.lua | 166 |
1 files changed, 166 insertions, 0 deletions
diff --git a/fuzzel.lua b/fuzzel.lua new file mode 100644 index 0000000..e95dcf7 --- /dev/null +++ b/fuzzel.lua @@ -0,0 +1,166 @@ +--[[ + Fuzzel v1.0 - Alexander "Apickx" Pickering + Entered into the public domain June 2, 2016 + You are not required to, but consider linking back to the source! + + Some helper functions for calculateing distance between two strings + + Provides: + fuzzel.LevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost) + Calculates the Levenshtien Distance between two strings, useing the costs given. "Real" Levenshtien Distance uses values 1,1,1 for costs. + returns number_distance + + fuzzel.LevenshtienDistance(string_first, strings_second) + Calculates the "real" Levenshtien Distance + returns number distance + + fuzzel.LevensteinRatio(string_first, string_second) + The Levenshtien Ratio divided by the first string's length. Useing a ratio is a decent way to determin if a spelling is "close enough" + returns number_distance + + fuzzel.DamerauLevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost, number_transpositioncost) + Damerau-Levenshtien Distance is almost exactly like Levenshtien Distance, with the caveat that two letters next to each other, with swapped positions only counts as "one" cost (in "real" Damerau-Levenshtien Distance) + returns number + + fuzzel.DamerauLevenshtienDistance(stirng_first, strings_second) + Calculates the "real" Damerau-Levenshtien Distance + returns number + + fuzzel.DamerauLevenshtienRatio(string_first, string_second) + The Damerau-Levenshtien Distance divided by the first string's length + returns number + + fuzzel.HammingDistance(string_first, string_second) + Purely the number of substitutions needed to change one string into another. Note that both strings must be the same length. + returns number + + fuzzel.HammingRatio(string_first, string_second) + The hamming distance divided by the length of the first string + returns number + + fuzzel.FuzzySearchDistance(string_needle, vararg_in) + in may be either a table, or a list of arguments. fuzzel.FuzzySearchDistance will find the string that most closely resembles needle, based on Damerau-Levenshtien Distance + returns string_closest, number_distance + + fuzzel.FuzzySearchRatio(string_needle, vararg_in) + in may be either a table, or a list of arguments. Same as above, except it returns the string with the closest Damerau-Levenshtien ratio. + returns string_closest, nubmer_ratio +]] +local strlen,chrat,min,asrt,prs,iprs,typ,upack = string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack + +module("fuzzel") + +local function genericDistance( stringa, stringb, addcost, subcost, delcost, ...) + --Length of each string + local salen, sblen = strlen(stringa), strlen(stringb) + + --Create a 0 matrix the size of len(a) x len(b) + local dyntbl = {} + for i = 0,salen do + dyntbl[i] = {} + for j = 0,sblen do + dyntbl[i][j] = 0 + end + end + + --Initalize the matrix + for i = 1,salen do + dyntbl[i][0] = i + end + for j = 1,sblen do + dyntbl[0][j] = j + end + + --And build up the matrix based on costs-so-far + for j = 1,sblen do + for i = 1,salen do + local ca = chrat(stringa,i) + local cb = chrat(stringb,j) + dyntbl[i][j] = min( + dyntbl[i-1][j] + delcost, --deletion + dyntbl[i][j-1] + addcost, --insertion + dyntbl[i-1][j-1] + (ca == cb and 0 or subcost) --substituion + ) + if arg[1] and i > 1 and j > 1 and ca == chrat(stringb,j-1) and chrat(stringa,i-1) == cb then + dyntbl[i][j] = min(dyntbl[i][j], + dyntbl[i-2][j-2] + (ca == cb and 0 or arg[2])) --transposition + end + end + end + + return dyntbl[salen][sblen] +end + +function LevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost) + return genericDistance(stringa, stringb, addcost, subcost, delcost) +end + +function LevenshtienDistance(stringa,stringb) + return LevenshtienDistance_extended(stringa,stringb,1,1,1) +end + +--The distance as a ratio of stringa's length +function LevenshteinRatio(stringa,stringb) + return LevenshtienDistance(stringa,stringb) / strlen(stringa) +end +--Almost the same as LevenshtienDistance, but considers two characters swaped as only "one" mistake +function DamerauLevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost, trncost) + return genericDistance(stringa,stringb,addcost,subcost,delcost,true,trncost) +end + +function DamerauLevenshtienDistance(stringa,stringb) + return DamerauLevenshtienDistance_extended(stringa,stringb,1,1,1,1) +end + +function DamerauLevenshtienRatio(stringa,stringb) + return DamerauLevenshtienDistance(stringa,stringb) / strlen(stringa) +end + +--Purely the number of mistakes +function HammingDistance(stringa,stringb) + local len = strlen(stringa) + asrt(len == strlen(stringb),"Hamming Distance cannot be calculated on two strings of different lengths:\"" .. stringa .. "\" \"" .. stringb .. "\"") + local dist = 0 + for i = 1,len do + dist = dist + (chrat(stringa,i) ~= chrat(stringb,i) and 1) + end + return dist +end + +function HammingRatio(stringa,stringb) + return HammingDistance(stringa,stringb) / strlen(stringa) +end + +local function FuzzySearch(str,func,...) + local itrfunc = typ(arg[1]) == "table" and prs or iprs + local tmin = func(arg[1],str) + local sout = arg[1] + for k,v in itrfunc(arg) do + local t = func(v,str) + if t < tmin then + tmin = t + sout = v + end + end + return sout, tmin +end + +function FuzzySearchDistance(str,...) + return upack{FuzzySearch(str,DamerauLevenshtienDistance,...)} +end + +function FuzzySearchRatio(str,...) + return upack{FuzzySearch(str,DamerauLevenshtienRatio,...)} +end + +--Some easy-to-use mnemonics +ld_e = LevenshtienDistance_extended +ld = LevenshtienDistance +lr = LevensteinRatio +dld_e = DamerauLevenshtienDistance_extended +dld = DamerauLevenshtienDistance +dlr = DamerauLevenshtienRatio +hd = HammingDistance +hr = HammingRatio +fsd = FuzzySearchDistance +fsr = FuzzySearchRatio |
