001 (ns org.clojars.punit-naik.clj-ml.utils.string)
002
(defn indexed-string
  "Pairs every character of `string` with its zero-based position.
  Returns a lazy sequence of `[index character]` vectors."
  [string]
  (map vector (range) string))
006
(defn match-char-to-str
  "Computes one row of the Levenshtein edit-distance table: matches the
  character `c` against every character of `string`.

  Arguments:
    edit-map-init - map of [row col] -> edit distance holding the rows
                    computed for the previous characters (empty for row 0)
    c-pos         - zero-based position of `c` in its original string,
                    i.e. the row index
    c             - the character being matched
    string        - the string whose characters form the columns

  Returns `[edit-distance edit-map]`, where `edit-distance` is the distance
  stored in the last cell of this row (or `(inc c-pos)` when `string` is
  empty, i.e. the cost of deleting every character seen so far) and
  `edit-map` is `edit-map-init` extended with the cells of this row."
  [edit-map-init c-pos c string]
  (let [prev-row (dec c-pos)
        ;; Looks up a cell of the table, synthesising the virtual border:
        ;; row -1 / column -1 stand for the empty prefix, whose distance to
        ;; a prefix ending at index k is k + 1 (and 0 for the corner).
        cell (fn [edm row col]
               (cond
                 (neg? row) (inc col)
                 (neg? col) (inc row)
                 :else (edm [row col])))]
    (reduce (fn [[left edm] [col string-char]]
              (let [;; The substitution cost applies to every cell,
                    ;; including (0, 0): differing first characters cost 1.
                    cost (if (= c string-char) 0 1)
                    distance (min (inc left)                           ; insertion
                                  (inc (cell edm prev-row col))        ; deletion
                                  (+ (cell edm prev-row (dec col))     ; substitution
                                     cost))]
                [distance (assoc edm [c-pos col] distance)]))
            ;; Seed with the virtual cell to the left of column 0.
            [(inc c-pos) edit-map-init]
            (map-indexed vector string))))
032
(defn match-strings
  "Computes the Levenshtein edit distance between the strings `s1` and `s2`.

  Returns a map with:
    :edit-map   - map of [s1-index s2-index] -> edit distance between the
                  prefixes of `s1` and `s2` ending at those indices
    :similarity - the edit distance between the full strings (the key name
                  is historical; a smaller value means more similar)

  When either string is empty the distance is the length of the other
  string and `:edit-map` is empty."
  [s1 s2]
  (if (or (empty? s1) (empty? s2))
    ;; No table to build: turning one string into the other takes exactly
    ;; as many insertions/deletions as the non-empty string has characters.
    {:edit-map {} :similarity (max (count s1) (count s2))}
    (reduce (fn [{:keys [edit-map] :as m} [string-char-pos string-char]]
              (let [[edit-distance edit-distance-map]
                    (match-char-to-str edit-map string-char-pos string-char s2)]
                ;; After the last row, :similarity holds the bottom-right
                ;; cell of the table, i.e. the full edit distance.
                (assoc m :edit-map edit-distance-map :similarity edit-distance)))
            {:edit-map {}}
            (indexed-string s1))))
041
(defn reversed-levenstein-distance
  "Finds the reversed Levenshtein distance (percentage of match) between
  two strings `s1` and `s2` by normalising their edit distance against the
  combined length of both strings.

  Returns a double: 100.0 for identical strings (including two empty
  strings) and 0.0 when exactly one of the strings is empty."
  [s1 s2]
  (let [total-length (+ (count s1) (count s2))]
    (cond
      ;; Two empty strings are identical; also avoids dividing by zero.
      (zero? total-length) 100.0
      ;; Against an empty string the edit distance equals the length of the
      ;; other string, which is also the combined length: 100 - 100 = 0.
      (or (empty? s1) (empty? s2)) 0.0
      :else
      (let [{:keys [similarity]} (match-strings s1 s2)]
        (double (- 100 (* 100 (/ similarity total-length))))))))