001  (ns org.clojars.punit-naik.clj-ml.utils.string)
002  
(defn indexed-string
  "Pairs every element of `string` with its zero-based position,
   returning a lazy sequence of `[index element]` vectors"
  [string]
  (map (fn [idx ch] [idx ch])
       (range)
       string))
006  
(defn match-char-to-str
  "Matches a character `c` against every character of `string`, filling in
   one row of the Levenshtein edit-distance table.

   Arguments:
   - `edit-map-init`: map of `[c-pos string-char-pos]` -> edit distance built by
     the previous rows; the entries for row `(dec c-pos)` must be present
     whenever `c-pos` is greater than zero
   - `c-pos`: zero-based position of `c` in its original string
   - `c`: the character being matched
   - `string`: the string whose characters `c` is compared against

   Returns `[edit-distance edit-map]` where `edit-distance` is the value of the
   last cell of this row and `edit-map` is `edit-map-init` extended with every
   cell of this row. For an empty `string` the distance is `(inc c-pos)`
   (all characters seen so far have to be removed) and the map is unchanged."
  [edit-map-init c-pos c string]
  (let [c-pos-zero? (zero? c-pos)]
    (reduce (fn [[edit-distance edm]
                 [string-char-pos string-char]]
              (let [e (if (= c string-char) 0 1)
                    string-char-pos-zero? (zero? string-char-pos)
                    ;; Cell to the left in the same row, plus one edit. The
                    ;; accumulator starts at the implicit boundary value
                    ;; `(inc c-pos)`, so the first column needs no special case.
                    from-left (inc edit-distance)
                    ;; Cell above (previous row, same column), plus one edit.
                    ;; For the first row the implicit boundary value is
                    ;; `(inc string-char-pos)`.
                    from-above (if c-pos-zero?
                                 (+ string-char-pos 2)
                                 (inc (edm [(dec c-pos) string-char-pos])))
                    ;; Diagonal cell plus the substitution cost `e`. The cost
                    ;; is always applied, including for the very first pair
                    ;; of characters.
                    from-diagonal (+ (cond
                                       c-pos-zero? string-char-pos
                                       string-char-pos-zero? c-pos
                                       :else (edm [(dec c-pos) (dec string-char-pos)]))
                                     e)
                    new-edit-distance (min from-left from-above from-diagonal)]
                [new-edit-distance (assoc edm [c-pos string-char-pos] new-edit-distance)]))
            ;; Boundary value: distance between the first `(inc c-pos)`
            ;; characters of the original string and the empty string.
            [(inc c-pos) edit-map-init]
            (map-indexed vector string))))
032  
(defn match-strings
  "Matches two strings `s1` and `s2` character by character and builds their
   edit-distance table.

   Returns a map with:
   - `:edit-map`: map of `[s1-position s2-position]` -> edit distance
   - `:similarity`: the edit distance reported for the last character of `s1`
     (despite the key name, a lower value means the strings are more alike)

   When `s1` is empty there is nothing to iterate over, so `:edit-map` is empty
   and `:similarity` is `(count s2)`, i.e. every character of `s2` has to be
   inserted."
  [s1 s2]
  (reduce (fn [{:keys [edit-map] :as m} [string-char-pos string-char]]
            (let [[edit-distance edit-distance-map] (match-char-to-str edit-map string-char-pos string-char s2)]
              (assoc m :edit-map edit-distance-map :similarity edit-distance)))
          ;; Seed `:similarity` so that an empty `s1` still yields a number
          ;; instead of a missing key.
          {:edit-map {} :similarity (count s2)}
          (indexed-string s1)))
041  
(defn reversed-levenstein-distance
  "Finds out the reversed Levenshtein distance (percentage of match)
   between two strings `s1` and `s2` by normalising the edit distance.

   The result is a double computed as
   `100 - 100 * edit-distance / (count s1 + count s2)`; note that the
   normalisation uses the combined length of both strings.
   Two empty strings are identical and yield `100.0`."
  [s1 s2]
  (let [total-length (+ (count s1) (count s2))]
    (if (zero? total-length)
      ;; Both inputs are empty: avoid dividing by zero and report a full match.
      100.0
      (let [{:keys [similarity]} (match-strings s1 s2)]
        (double (- 100 (* 100 (/ similarity total-length))))))))