addon/lib/fuzzy_match/algorithims.py

#!/usr/bin/env python
# encoding: utf-8
import math
from math import floor, ceil
import re
from collections import Counter
# import numpy as np


def find_ngrams(string, split_num=3):
    """
    Slice string into ngrams.
    Returns array of ngrams for the given string.

    Arguments:
        text: the string to find ngrams for.
        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
    """
    try:
        if not string:
            return set()

        words = ['  {} '.format(x) for x in re.split(r'\W+', str(string).lower()) if x.strip()]

        ngrams = set()

        for word in words:
            for x in range(0, len(word) - split_num + 1):
                ngrams.add(word[x:x+split_num])

        return ngrams

    except:
        return None


def trigram(text1, text2, split_num=3):
    """
    Find the similarity between two strings using ngrams.
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
        split_num: the length the ngrams should be. Defaults to 3 (trigrams).
    """
    try:
        ngrams1 = find_ngrams(text1, split_num)
        ngrams2 = find_ngrams(text2, split_num)

        num_unique = len(ngrams1 | ngrams2)
        num_equal = len(ngrams1 & ngrams2)

        score = round(float(num_equal) / float(num_unique), 6)

        return score

    except:
        return None


def cosine(text1, text2):
    """
    Find the similarity between two strings using cosine vectors.
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
    """
    try:
        vec1 = Counter(re.compile(r"\w+").findall(text1))
        vec2 = Counter(re.compile(r"\w+").findall(text2))
        intersection = set(vec1.keys()) & set(vec2.keys())
        numerator = sum([vec1[x] * vec2[x] for x in intersection])

        sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
        sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
        denominator = math.sqrt(sum1) * math.sqrt(sum2)

        if not denominator:
            return 0.0
        else:
            return float(numerator) / denominator

    except:
        return None


def levenshtein(text1, text2):
    """
    Find the similarity between two strings using Levenshtein distance.
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
    """
    try:
        size_x = len(text1) + 1
        size_y = len(text2) + 1
        matrix = np.zeros ((size_x, size_y))
        for x in range(size_x):
            matrix [x, 0] = x
        for y in range(size_y):
            matrix [0, y] = y

        for x in range(1, size_x):
            for y in range(1, size_y):
                if text1[x-1] == text2[y-1]:
                    matrix [x,y] = min(
                        matrix[x-1, y] + 1,
                        matrix[x-1, y-1],
                        matrix[x, y-1] + 1
                    )
                else:
                    matrix [x,y] = min(
                        matrix[x-1,y] + 1,
                        matrix[x-1,y-1] + 1,
                        matrix[x,y-1] + 1
                    )
        distance = matrix[size_x - 1, size_y - 1]
        score = (max(len(text1), len(text2)) - distance) / max(len(text1), len(text2))
        return float(score)

    except:
        return None


def jaro_winkler(s1, s2):
    """
    Find the similarity between two strings using Jaro-Winkler distance.
    Returns float score value, 0.0 being completely different strings and 1.0 being equal strings.

    Arguments:
        text1: main string to compare against.
        text2: second string to compare to text1.
    """
    try:
        if (s1 == s2):
            return 1.0

        len1 = len(s1)
        len2 = len(s2)
        max_dist = floor(max(len1, len2) / 2) - 1
        match = 0
        hash_s1 = [0] * len(s1)
        hash_s2 = [0] * len(s2)

        for i in range(len1):
            for j in range(max(0, i - max_dist),
                        min(len2, i + max_dist + 1)):

                if (s1[i] == s2[j] and hash_s2[j] == 0):
                    hash_s1[i] = 1
                    hash_s2[j] = 1
                    match += 1
                    break

        if (match == 0):
            return 0.0

        t = 0
        point = 0

        for i in range(len1):
            if (hash_s1[i]):

                while (hash_s2[point] == 0):
                    point += 1

                if (s1[i] != s2[point]):
                    point += 1
                    t += 1
        t = t//2

        return float(match/ len1 + match / len2 +
                (match - t + 1) / match)/ 3.0
    except:
        return None