/*
* Copyright (C) 2011 Collabora Ltd.
*
* This library is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see .
*
* Authors:
* Raul Gutierrez Segales
*/
using Gee;
/**
* Likely-ness of a potential match.
*
* Note that the order should be maintained.
*
* @since 0.5.0
*/
public enum Folks.MatchResult
{
/**
* Very low likelihood of a match.
*/
VERY_LOW,
/**
* Low likelihood of a match.
*/
LOW,
/**
* Medium likelihood of a match.
*/
MEDIUM,
/**
* High likelihood of a match.
*/
HIGH,
/**
* Very high likelihood of a match.
*/
VERY_HIGH,
/**
* Minimum likelihood of a match.
*/
MIN = VERY_LOW,
/**
* Maximum likelihood of a match.
*/
MAX = VERY_HIGH
}
/**
* This class provides functionality to explore a potential match between
* two individuals.
*
* @since 0.5.0
*/
public class Folks.PotentialMatch : Object
{
MatchResult _result;
private Folks.Individual _individual_a;
private Folks.Individual _individual_b;
/**
* A set of e-mail addresses known to be aliases of each other, such as
* various forms of administrator address.
*
* @since 0.5.1
*/
public static Set known_email_aliases = null;
private static double _DIST_THRESHOLD = 0.70;
private const string _SEPARATORS = "._-+";
public PotentialMatch ()
{
if (this.known_email_aliases == null)
{
this.known_email_aliases = new Gee.HashSet (str_hash,
str_equal);
this.known_email_aliases.add ("admin");
this.known_email_aliases.add ("abuse");
this.known_email_aliases.add ("webmaster");
}
}
/**
* Whether two individuals are likely to be the same person.
*
* @param a an individual to compare
* @param b another individual to compare
*
* @since 0.5.0
*/
public MatchResult potential_match (Individual a, Individual b)
{
this._individual_a = a;
this._individual_b = b;
this._result = MatchResult.MIN;
if (this._individual_a.gender != Gender.UNSPECIFIED &&
this._individual_b.gender != Gender.UNSPECIFIED &&
this._individual_a.gender != this._individual_b.gender)
{
return this._result;
}
/* If individuals share common im-addresses */
this._inspect_im_addresses ();
if (this._result == MatchResult.MAX)
return this._result;
/* If individuals share common e-mails */
this._inspect_emails ();
if (this._result == MatchResult.MAX)
return this._result;
/* If individuals share common phone numbers */
this._inspect_phone_numbers ();
if (this._result == MatchResult.MAX)
return this._result;
/* they have the same (normalised) name? */
this._name_similarity ();
if (this._result == MatchResult.MAX)
return this._result;
return this._result;
}
private void _inspect_phone_numbers ()
{
var set_a = this._individual_a.phone_numbers;
var set_b = this._individual_b.phone_numbers;
foreach (var phone_fd_a in set_a)
{
foreach (var phone_fd_b in set_b)
{
if (phone_fd_a.equal (phone_fd_b))
{
this._result = MatchResult.HIGH;
return;
}
}
}
}
/* Approach:
* - taking in account family, given, prefix, suffix and additional names
* we give some points for each non-empty match
*
* @since 0.5.0
*/
private void _name_similarity ()
{
Folks.StructuredName a = this._individual_a.structured_name;
Folks.StructuredName b = this._individual_b.structured_name;
double similarity = 0.0;
if (this._look_alike (this._individual_a.nickname,
this._individual_b.nickname))
{
similarity += 0.20;
}
if (this._look_alike (this._individual_a.full_name,
this._individual_b.full_name))
{
similarity += 0.70;
}
if (a != null && b != null)
{
if (a.is_empty () == false && a.equal (b))
{
this._result = MatchResult.HIGH;
return;
}
if (Folks.Utils._str_equal_safe (a.given_name, b.given_name))
similarity += 0.20;
if (this._look_alike (a.family_name, b.family_name) &&
this._look_alike (a.given_name, b.given_name))
{
similarity += 0.40;
}
if (Folks.Utils._str_equal_safe (a.additional_names,
b.additional_names))
similarity += 0.5;
if (Folks.Utils._str_equal_safe (a.prefixes, b.prefixes))
similarity += 0.5;
if (Folks.Utils._str_equal_safe (a.suffixes, b.suffixes))
similarity += 0.5;
}
debug ("[name_similarity] Got %f\n", similarity);
if (similarity >= this._DIST_THRESHOLD)
this._result = this._inc_match_level (this._result, 2);
}
/**
* Number of equal IM addresses between two individuals.
*
* @since 0.5.0
*/
public void _inspect_im_addresses ()
{
foreach (var proto in this._individual_a.im_addresses.get_keys ())
{
var addrs_a = this._individual_a.im_addresses.get (proto);
var addrs_b = this._individual_b.im_addresses.get (proto);
foreach (var im_a in addrs_a)
{
if (addrs_b.contains (im_a))
{
this._result = MatchResult.HIGH;
return;
}
}
}
}
/**
* Inspect email addresses.
*
* @since 0.5.0
*/
private void _inspect_emails ()
{
var set_a = this._individual_a.email_addresses;
var set_b = this._individual_b.email_addresses;
foreach (var fd_a in set_a)
{
foreach (var fd_b in set_b)
{
string[] email_split_a = fd_a.value.split ("@");
string[] email_split_b = fd_b.value.split ("@");
if (fd_a.value == fd_b.value)
{
if (this.known_email_aliases.contains
(email_split_a[0]) == true)
{
if (this._result < MatchResult.HIGH)
{
this._result = MatchResult.LOW;
}
}
else
{
this._result = MatchResult.HIGH;
return;
}
}
else
{
string[] tokens_a =
email_split_a[0].split_set (this._SEPARATORS);
string[] tokens_b =
email_split_b[0].split_set (this._SEPARATORS);
/* Do we have: first.middle.last@ ~= fml@ ? */
if (this._check_initials_expansion (tokens_a, tokens_b))
{
this._result = MatchResult.MEDIUM;
}
/* So we have splitted the user part of the e-mail
* address into tokens. Lets see if there is some
* matches between tokens.
* As in: first.middle.last@ ~= [first,middle,..]@ */
else if (this._match_tokens (tokens_a, tokens_b))
{
this._result = MatchResult.MEDIUM;
}
}
}
}
}
/* We are after:
* you.are.someone@ =~ yas@
*/
private bool _check_initials_expansion (string[] tokens_a, string[] tokens_b)
{
if (tokens_a.length > tokens_b.length &&
tokens_b.length == 1)
{
return this._do_check_initials_expansion (tokens_a, tokens_b[0]);
}
else if (tokens_b.length > tokens_a.length &&
tokens_a.length == 1)
{
return this._do_check_initials_expansion (tokens_b, tokens_a[0]);
}
return false;
}
private bool _do_check_initials_expansion (string[] expanded_name,
string initials)
{
if (expanded_name.length != initials.length)
return false;
for (int i=0; i tokens_b.length)
return this._do_match_tokens (tokens_a, tokens_b);
else
return this._do_match_tokens (tokens_b, tokens_a);
}
private bool _do_match_tokens (string[] bigger_set, string[] smaller_set)
{
for (var i=0; i < smaller_set.length; i++)
{
for (var j=0; j < bigger_set.length; j++)
{
if (smaller_set[i] == bigger_set[j])
return true;
}
}
return false;
}
private MatchResult _inc_match_level (
MatchResult current_level, int times = 1)
{
MatchResult ret = current_level + times;
if (ret > MatchResult.MAX)
ret = MatchResult.MAX;
return ret;
}
private bool _look_alike (string? a, string? b)
{
if (a == null || b == null)
{
return false;
}
bool alike = this.jaro_dist (a, b) >= this._DIST_THRESHOLD ? true : false;
return alike;
}
/* Based on:
* http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
*
* d = 1/3 * ( m/|s1| + m/|s2| + (m - t)/m )
*
* where
*
* m = matching characters
* t = number of transpositions
*/
private double jaro_dist (string s1, string s2)
{
double distance;
int max = s1.length > s2.length ? s1.length : s2.length;
int max_dist = (max / 2) - 1;
double t;
double m = (double) this._matches (s1, s2, max_dist, out t);
double len_s1 = (double) s1.length;
double len_s2 = (double) s2.length;
double a = m / len_s1;
double b = m / len_s2;
double c = 0;
if ((int) m > 0)
c = (m - t) / m;
distance = (1.0/3.0) * (a + b + c);
debug ("[jaro_dist] Distance for %s and %s: %f\n", s1, s2, distance);
return distance;
}
/* Calculate matches and transpositions as defined by the Jaro distance.
*/
private int _matches (string s1, string s2, int max_dist, out double t)
{
int matches = 0;
t = 0.0;
for (int i=0; i < s1.length; i++)
{
var look_for = s1.slice (i, i + 1);
int contains = this._contains (s2, look_for, i, max_dist);
if (contains >= 0)
{
matches++;
if (contains > 0)
t += 1.0;
}
}
debug ("%s and %s have %d matches and %f / 2 transpositions\n",
s1, s2, matches, t);
t = t / 2.0;
return matches;
}
/* If haystack contains c in pos return 0, if it contains
* it withing the bounds of max_dist return abs(pos-pos_found).
* If its not found, return -1. */
private int _contains (string haystack, string c, int pos, int max_dist)
{
if (pos < haystack.length && haystack.slice (pos, pos + 1) == c)
return 0;
for (int i=pos-max_dist; i <= pos + max_dist; i++)
{
if (i < 0 || i >= haystack.length)
continue;
var str = haystack.slice (i, i + 1);
if (str == c)
return (pos - i).abs ();
}
return -1;
}
}