aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fuzzel.lua166
-rw-r--r--fuzzel_min.lua2
2 files changed, 168 insertions, 0 deletions
diff --git a/fuzzel.lua b/fuzzel.lua
new file mode 100644
index 0000000..e95dcf7
--- /dev/null
+++ b/fuzzel.lua
@@ -0,0 +1,166 @@
+--[[
+ Fuzzel v1.0 - Alexander "Apickx" Pickering
+ Entered into the public domain June 2, 2016
+ You are not required to, but consider linking back to the source!
+
+ Some helper functions for calculateing distance between two strings
+
+ Provides:
+ fuzzel.LevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost)
+ Calculates the Levenshtien Distance between two strings, useing the costs given. "Real" Levenshtien Distance uses values 1,1,1 for costs.
+ returns number_distance
+
+ fuzzel.LevenshtienDistance(string_first, strings_second)
+ Calculates the "real" Levenshtien Distance
+ returns number distance
+
+ fuzzel.LevensteinRatio(string_first, string_second)
+ The Levenshtien Ratio divided by the first string's length. Useing a ratio is a decent way to determin if a spelling is "close enough"
+ returns number_distance
+
+ fuzzel.DamerauLevenshtienDistance_extended(string_first, string_second, number_addcost, number_substituecost, number_deletecost, number_transpositioncost)
+ Damerau-Levenshtien Distance is almost exactly like Levenshtien Distance, with the caveat that two letters next to each other, with swapped positions only counts as "one" cost (in "real" Damerau-Levenshtien Distance)
+ returns number
+
+ fuzzel.DamerauLevenshtienDistance(stirng_first, strings_second)
+ Calculates the "real" Damerau-Levenshtien Distance
+ returns number
+
+ fuzzel.DamerauLevenshtienRatio(string_first, string_second)
+ The Damerau-Levenshtien Distance divided by the first string's length
+ returns number
+
+ fuzzel.HammingDistance(string_first, string_second)
+ Purely the number of substitutions needed to change one string into another. Note that both strings must be the same length.
+ returns number
+
+ fuzzel.HammingRatio(string_first, string_second)
+ The hamming distance divided by the length of the first string
+ returns number
+
+ fuzzel.FuzzySearchDistance(string_needle, vararg_in)
+ in may be either a table, or a list of arguments. fuzzel.FuzzySearchDistance will find the string that most closely resembles needle, based on Damerau-Levenshtien Distance
+ returns string_closest, number_distance
+
+ fuzzel.FuzzySearchRatio(string_needle, vararg_in)
+ in may be either a table, or a list of arguments. Same as above, except it returns the string with the closest Damerau-Levenshtien ratio.
+ returns string_closest, nubmer_ratio
+]]
+local strlen,chrat,min,asrt,prs,iprs,typ,upack = string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack
+
+module("fuzzel")
+
+local function genericDistance( stringa, stringb, addcost, subcost, delcost, ...)
+ --Length of each string
+ local salen, sblen = strlen(stringa), strlen(stringb)
+
+ --Create a 0 matrix the size of len(a) x len(b)
+ local dyntbl = {}
+ for i = 0,salen do
+ dyntbl[i] = {}
+ for j = 0,sblen do
+ dyntbl[i][j] = 0
+ end
+ end
+
+ --Initalize the matrix
+ for i = 1,salen do
+ dyntbl[i][0] = i
+ end
+ for j = 1,sblen do
+ dyntbl[0][j] = j
+ end
+
+ --And build up the matrix based on costs-so-far
+ for j = 1,sblen do
+ for i = 1,salen do
+ local ca = chrat(stringa,i)
+ local cb = chrat(stringb,j)
+ dyntbl[i][j] = min(
+ dyntbl[i-1][j] + delcost, --deletion
+ dyntbl[i][j-1] + addcost, --insertion
+ dyntbl[i-1][j-1] + (ca == cb and 0 or subcost) --substituion
+ )
+ if arg[1] and i > 1 and j > 1 and ca == chrat(stringb,j-1) and chrat(stringa,i-1) == cb then
+ dyntbl[i][j] = min(dyntbl[i][j],
+ dyntbl[i-2][j-2] + (ca == cb and 0 or arg[2])) --transposition
+ end
+ end
+ end
+
+ return dyntbl[salen][sblen]
+end
+
+function LevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost)
+ return genericDistance(stringa, stringb, addcost, subcost, delcost)
+end
+
+function LevenshtienDistance(stringa,stringb)
+ return LevenshtienDistance_extended(stringa,stringb,1,1,1)
+end
+
+--The distance as a ratio of stringa's length
+function LevenshteinRatio(stringa,stringb)
+ return LevenshtienDistance(stringa,stringb) / strlen(stringa)
+end
+--Almost the same as LevenshtienDistance, but considers two characters swaped as only "one" mistake
+function DamerauLevenshtienDistance_extended(stringa, stringb, addcost, subcost, delcost, trncost)
+ return genericDistance(stringa,stringb,addcost,subcost,delcost,true,trncost)
+end
+
+function DamerauLevenshtienDistance(stringa,stringb)
+ return DamerauLevenshtienDistance_extended(stringa,stringb,1,1,1,1)
+end
+
+function DamerauLevenshtienRatio(stringa,stringb)
+ return DamerauLevenshtienDistance(stringa,stringb) / strlen(stringa)
+end
+
+--Purely the number of mistakes
+function HammingDistance(stringa,stringb)
+ local len = strlen(stringa)
+ asrt(len == strlen(stringb),"Hamming Distance cannot be calculated on two strings of different lengths:\"" .. stringa .. "\" \"" .. stringb .. "\"")
+ local dist = 0
+ for i = 1,len do
+ dist = dist + (chrat(stringa,i) ~= chrat(stringb,i) and 1)
+ end
+ return dist
+end
+
+function HammingRatio(stringa,stringb)
+ return HammingDistance(stringa,stringb) / strlen(stringa)
+end
+
+local function FuzzySearch(str,func,...)
+ local itrfunc = typ(arg[1]) == "table" and prs or iprs
+ local tmin = func(arg[1],str)
+ local sout = arg[1]
+ for k,v in itrfunc(arg) do
+ local t = func(v,str)
+ if t < tmin then
+ tmin = t
+ sout = v
+ end
+ end
+ return sout, tmin
+end
+
+function FuzzySearchDistance(str,...)
+ return upack{FuzzySearch(str,DamerauLevenshtienDistance,...)}
+end
+
+function FuzzySearchRatio(str,...)
+ return upack{FuzzySearch(str,DamerauLevenshtienRatio,...)}
+end
+
+--Some easy-to-use mnemonics
+ld_e = LevenshtienDistance_extended
+ld = LevenshtienDistance
+lr = LevensteinRatio
+dld_e = DamerauLevenshtienDistance_extended
+dld = DamerauLevenshtienDistance
+dlr = DamerauLevenshtienRatio
+hd = HammingDistance
+hr = HammingRatio
+fsd = FuzzySearchDistance
+fsr = FuzzySearchRatio
diff --git a/fuzzel_min.lua b/fuzzel_min.lua
new file mode 100644
index 0000000..d2cec2c
--- /dev/null
+++ b/fuzzel_min.lua
@@ -0,0 +1,2 @@
+--This file has been minified! For the original, see cogarr.net/source/
+local a,b,c,d,e,f,g,h=string.len,string.byte,math.min,assert,pairs,ipairs,type,unpack;module("fuzzel")function LevenshtienDistance_extended(i,j,k,l,m,...)local n,o=a(i),a(j)local p={}for q=0,n do p[q]={}for r=0,o do p[q][r]=0 end end;for q=1,n do p[q][0]=q end;for r=1,o do p[0][r]=r end;for r=1,o do for q=1,n do local s=b(i,q)local t=b(j,r)p[q][r]=c(p[q-1][r]+m,p[q][r-1]+k,p[q-1][r-1]+(s==t and 0 or l))if arg[1]and q>1 and r>1 and s==b(j,r-1)and b(i,q-1)==t then p[q][r]=c(p[q][r],p[q-2][r-2]+(s==t and 0 or arg[2]))end end end;return p[n][o]end;function LevenshtienDistance(i,j)return LevenshtienDistance_extended(i,j,1,1,1,false,0)end;function LevenshteinRatio(i,j)return LevenshtienDistance(i,j)/a(i)end;function DamerauLevenshtienDistance_extended(i,j,k,l,m,u)return LevenshtienDistance_extended(i,j,k,l,m,true,u)end;function DamerauLevenshtienDistance(i,j)return DamerauLevenshtienDistance_extended(i,j,1,1,1,1)end;function DamerauLevenshtienRatio(i,j)return DamerauLevenshtienDistance(i,j)/a(i)end;function HammingDistance(i,j)local v=a(i)d(v==a(j),"Hamming Distance cannot be calculated on two strings of different lengths:\""..i.."\" \""..j.."\"")local w=0;for q=1,v do w=w+(b(i,q)~=b(j,q)and 1)end;return w end;function HammingRatio(i,j)return HammingDistance(i,j)/a(i)end;local function x(y,z,...)local A=g(arg[1])=="table"and e or f;local B=z(arg[1],y)local C=arg[1]for D,E in A(arg)do local F=z(E,y)if F<B then B=F;C=E end end;return C,B end;function FuzzySearchDistance(y,...)return h{x(y,DamerauLevenshtienDistance,...)}end;function FuzzySearchRatio(y,...)return h{x(y,DamerauLevenshtienRatio,...)}end;ld_e=LevenshtienDistance_extended;ld=LevenshtienDistance;lr=LevensteinRatio;dld_e=DamerauLevenshtienDistance_extended;dld=DamerauLevenshtienDistance;dlr=DamerauLevenshtienRatio;hd=HammingDistance;hr=HammingRatio;fsd=FuzzySearchDistance;fsr=FuzzySearchRatio