001 (ns org.clojars.punit-naik.clj-ml.lsh
002 (:require [clojure.set :refer [subset?]]
003 [clojure.string :refer [join]]
004 [org.clojars.punit-naik.clj-ml.utils.generic :refer [shingles]]
005 [org.clojars.punit-naik.clj-ml.utils.string :refer [reversed-levenstein-distance]]
006 [æsahættr :refer [murmur3-128 hash->long hash-string]]))
007
008 (defn hash-n-times
009 "Hashes a shingles list `n` times"
010 [sh-list n]
011 (let [hashes (map murmur3-128 (range n))]
012 (map
013 (fn [s] (map (fn [h] (hash->long (hash-string h s))) hashes))
014 sh-list)))
015
016 (defn min-hash
017 "Takes the lists of hashed values (where all of them have the same size)
018 and finds the minimum hash value at the position ‘i’ from every list
019 thereby generating a single list of hash values which is the minhash signature of that string"
020 [hash-values]
021 (reduce #(map min %1 %2) hash-values))
022
023 (defn band-hash
024 "Takes the minhash signature of a string and partitions it according to `band-size`
025 Then we hash each \"band\" (partition) as similar strings will tend have at least one matching hashed band"
026 [band-size minhash-list]
027 (let [banded-minhash-list (partition-all band-size minhash-list)]
028 (map
029 (fn [m h] (hash->long (hash-string h (join "-" m))))
030 banded-minhash-list
031 (map murmur3-128 (range (count banded-minhash-list))))))
032
033 ;(defn find-candidate-subsets
034 ; "Given a list of tuples of (<str>,<hash>), tries to generate a a list of similar candidates"
035 ; [str-hash-pair-list]
036 ; (reduce (fn []) {} str-hash-pair-list))
037
038 (defn compare-records
039 "Compares a list of records/string with each other using `org.clojars.punit-naik.clj-ml.utils.string/reversed-levenstein-distance`"
040 [records]
041 (loop [[s1-idx s1] (first records)
042 s2-rest (rest records)
043 result #{}]
044 (if (empty? s2-rest)
045 result
046 (recur (first s2-rest)
047 (rest s2-rest)
048 (into result
049 (map
050 (fn [[s2-idx s2]]
051 {:original-index s1-idx
052 :original-data s1
053 :possible-duplicate-index s2-idx
054 :possible-duplicate-data s2
055 :match-percentage (reversed-levenstein-distance s1 s2)}) s2-rest))))))
056
057 (defn merge-candidates
058 [candidate-list]
059 (loop [c (first candidate-list)
060 cr (rest candidate-list)
061 result #{}]
062 (if (empty? cr)
063 result
064 (recur (first cr)
065 (rest cr)
066 (into result (filter #(subset? c %) cr))))))
067
068 (defn merge-candidates-recursive
069 [candidate-list]
070 (loop [current-result candidate-list
071 merged-result (merge-candidates current-result)]
072 (if (or (empty? merged-result)
073 (= (count merged-result) (count current-result)))
074 current-result
075 (recur merged-result (merge-candidates merged-result)))))
076
077 (defn find-possible-duplicates
078 "Takes a collection of strings (`data`) and finds out the similar strings from the collection"
079 [shingle-size hash-count band-size match-threshold data]
080 (->> (pmap #(-> %
081 (shingles shingle-size)
082 (hash-n-times hash-count)
083 min-hash) data)
084 (mapcat #(map (fn [bh] [(:index %1) (:data %1) bh]) (band-hash band-size %2))
085 (map-indexed (fn [idx d] {:index idx :data d}) data))
086 (group-by last)
087 ;; Not really interested in the key as it is present in the grouped data
088 vals
089 (pmap #(set (map butlast %)))
090 ;; Merging candidates as there might be multiple hash values that match for different strings
091 merge-candidates-recursive
092 ;; Generating candidates list
093 (pmap compare-records)
094 (apply concat)
095 (filter #(>= (:match-percentage %) match-threshold))))