2 * Copyright (C) 2011 Collabora Ltd.
4 * This library is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License as published by
6 * the Free Software Foundation, either version 2.1 of the License, or
7 * (at your option) any later version.
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this library. If not, see <http://www.gnu.org/licenses/>.
18 * Raul Gutierrez Segales <raul.gutierrez.segales@collabora.co.uk>
24 * Likely-ness of a potential match.
26 * Note that the order should be maintained.
30 public enum Folks.MatchResult
33 * Very low likelihood of a match.
38 * Low likelihood of a match.
43 * Medium likelihood of a match.
48 * High likelihood of a match.
53 * Very high likelihood of a match.
58 * Minimum likelihood of a match.
63 * Maximum likelihood of a match.
69 * This class provides functionality to explore a potential match between
74 public class Folks.PotentialMatch : Object
77 private Folks.Individual _individual_a;
78 private Folks.Individual _individual_b;
81 * A set of e-mail addresses known to be aliases of each other, such as
82 * various forms of administrator address.
86 public static Set<string> known_email_aliases =
87 new Gee.HashSet<string> (str_hash, str_equal);
89 private static double _DIST_THRESHOLD = 0.70;
90 private const string _SEPARATORS = "._-+";
94 PotentialMatch.known_email_aliases.add ("admin");
95 PotentialMatch.known_email_aliases.add ("abuse");
96 PotentialMatch.known_email_aliases.add ("webmaster");
100 * Whether two individuals are likely to be the same person.
102 * @param a an individual to compare
103 * @param b another individual to compare
107 public MatchResult potential_match (Individual a, Individual b)
109 this._individual_a = a;
110 this._individual_b = b;
111 this._result = MatchResult.MIN;
113 if (this._individual_a.gender != Gender.UNSPECIFIED &&
114 this._individual_b.gender != Gender.UNSPECIFIED &&
115 this._individual_a.gender != this._individual_b.gender)
120 /* If individuals share common im-addresses */
121 this._inspect_im_addresses ();
122 if (this._result == MatchResult.MAX)
125 /* If individuals share common e-mails */
126 this._inspect_emails ();
127 if (this._result == MatchResult.MAX)
130 /* If individuals share common phone numbers */
131 this._inspect_phone_numbers ();
132 if (this._result == MatchResult.MAX)
135 /* they have the same (normalised) name? */
136 this._name_similarity ();
137 if (this._result == MatchResult.MAX)
143 private void _inspect_phone_numbers ()
145 var set_a = this._individual_a.phone_numbers;
146 var set_b = this._individual_b.phone_numbers;
148 foreach (var phone_fd_a in set_a)
150 foreach (var phone_fd_b in set_b)
152 if (phone_fd_a.values_equal (phone_fd_b))
154 this._result = MatchResult.HIGH;
162 * - taking in account family, given, prefix, suffix and additional names
163 * we give some points for each non-empty match
167 private void _name_similarity ()
169 double similarity = 0.0;
170 bool exact_match = false;
172 if (this._look_alike (this._individual_a.nickname,
173 this._individual_b.nickname))
178 if (this._look_alike_or_identical (this._individual_a.full_name,
179 this._individual_b.full_name,
181 this._look_alike_or_identical (this._individual_a.alias,
182 this._individual_b.full_name,
184 this._look_alike_or_identical (this._individual_a.full_name,
185 this._individual_b.alias,
187 this._look_alike_or_identical (this._individual_a.alias,
188 this._individual_b.alias,
194 var _a = this._individual_a.structured_name;
195 var _b = this._individual_b.structured_name;
197 if (_a != null && _b != null)
202 if (a.is_empty () == false && a.equal (b))
204 this._result = MatchResult.HIGH;
208 if (Folks.Utils._str_equal_safe (a.given_name, b.given_name))
211 if (this._look_alike (a.family_name, b.family_name) &&
212 this._look_alike (a.given_name, b.given_name))
217 if (Folks.Utils._str_equal_safe (a.additional_names,
221 if (Folks.Utils._str_equal_safe (a.prefixes, b.prefixes))
224 if (Folks.Utils._str_equal_safe (a.suffixes, b.suffixes))
228 debug ("[name_similarity] Got %f\n", similarity);
230 if (similarity >= this._DIST_THRESHOLD)
233 /* We need exact matches to go to at least HIGH, or otherwise its
234 not possible to get a HIGH match for e.g. a facebook telepathy
235 persona, where alias is the only piece of information
239 this._result = this._inc_match_level (this._result, inc);
244 * Number of equal IM addresses between two individuals.
246 * This compares the addresses without comparing their associated protocols.
250 public void _inspect_im_addresses ()
252 var addrs = new HashSet<string> ();
254 foreach (var im_a in this._individual_a.im_addresses.get_values ())
256 addrs.add (im_a.value);
259 foreach (var im_b in this._individual_b.im_addresses.get_values ())
261 if (addrs.contains (im_b.value) == true)
263 this._result = MatchResult.HIGH;
270 * Inspect email addresses.
274 private void _inspect_emails ()
276 var set_a = this._individual_a.email_addresses;
277 var set_b = this._individual_b.email_addresses;
279 foreach (var fd_a in set_a)
281 string[] email_split_a = fd_a.value.split ("@");
283 /* Sanity check for valid e-mail addresses. */
284 if (email_split_a.length < 2)
286 warning ("Invalid e-mail address when looking for potential " +
287 "match: %s", fd_a.value);
292 email_split_a[0].split_set (this._SEPARATORS);
294 foreach (var fd_b in set_b)
296 string[] email_split_b = fd_b.value.split ("@");
298 /* Sanity check for valid e-mail addresses. */
299 if (email_split_b.length < 2)
301 warning ("Invalid e-mail address when looking for " +
302 "potential match: %s", fd_b.value);
306 if (fd_a.value == fd_b.value)
308 if (PotentialMatch.known_email_aliases.contains
309 (email_split_a[0]) == true)
311 if (this._result < MatchResult.HIGH)
313 this._result = MatchResult.LOW;
318 this._result = MatchResult.HIGH;
325 email_split_b[0].split_set (this._SEPARATORS);
327 /* Do we have: first.middle.last@ ~= fml@ ? */
328 if (this._check_initials_expansion (tokens_a, tokens_b))
330 this._result = MatchResult.MEDIUM;
332 /* So we have splitted the user part of the e-mail
333 * address into tokens. Lets see if there is some
334 * matches between tokens.
335 * As in: first.middle.last@ ~= [first,middle,..]@ */
336 else if (this._match_tokens (tokens_a, tokens_b))
338 this._result = MatchResult.MEDIUM;
346 * you.are.someone@ =~ yas@
348 private bool _check_initials_expansion (string[] tokens_a, string[] tokens_b)
350 if (tokens_a.length > tokens_b.length &&
351 tokens_b.length == 1)
353 return this._do_check_initials_expansion (tokens_a, tokens_b[0]);
355 else if (tokens_b.length > tokens_a.length &&
356 tokens_a.length == 1)
358 return this._do_check_initials_expansion (tokens_b, tokens_a[0]);
363 private bool _do_check_initials_expansion (string[] expanded_name,
366 if (expanded_name.length != initials.length)
369 for (int i=0; i<expanded_name.length; i++)
371 if (expanded_name[i][0] != initials[i])
379 * We should probably count how many tokens matched?
381 private bool _match_tokens (string[] tokens_a, string[] tokens_b)
383 /* To find matching items from 2 sets its more efficient
384 * to make the outer loop go with the smaller set. */
385 if (tokens_a.length > tokens_b.length)
386 return this._do_match_tokens (tokens_a, tokens_b);
388 return this._do_match_tokens (tokens_b, tokens_a);
391 private bool _do_match_tokens (string[] bigger_set, string[] smaller_set)
393 for (var i=0; i < smaller_set.length; i++)
395 for (var j=0; j < bigger_set.length; j++)
397 if (smaller_set[i] == bigger_set[j])
405 private MatchResult _inc_match_level (
406 MatchResult current_level, int times = 1)
408 MatchResult ret = current_level + times;
409 if (ret > MatchResult.MAX)
410 ret = MatchResult.MAX;
415 private bool _look_alike_or_identical (string? a, string? b, out bool exact)
418 if (a == null || a == "" || b == null || b == "")
429 // a and b look alike if their Jaro distance is over the threshold.
430 return (this.jaro_dist ((!) a, (!) b) >= this._DIST_THRESHOLD);
433 private bool _look_alike (string? a, string? b)
435 if (a == null || a == "" || b == null || b == "")
440 // a and b look alike if their Jaro distance is over the threshold.
441 return (this.jaro_dist ((!) a, (!) b) >= this._DIST_THRESHOLD);
445 * http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
447 * d = 1/3 * ( m/|s1| + m/|s2| + (m - t)/m )
451 * m = matching characters
452 * t = number of transpositions
454 private double jaro_dist (string s1, string s2)
457 int max = s1.length > s2.length ? s1.length : s2.length;
458 int max_dist = (max / 2) - 1;
460 double m = (double) this._matches (s1, s2, max_dist, out t);
461 double len_s1 = (double) s1.length;
462 double len_s2 = (double) s2.length;
463 double a = m / len_s1;
464 double b = m / len_s2;
470 distance = (1.0/3.0) * (a + b + c);
472 debug ("[jaro_dist] Distance for %s and %s: %f\n", s1, s2, distance);
477 /* Calculate matches and transpositions as defined by the Jaro distance.
479 private int _matches (string s1, string s2, int max_dist, out double t)
484 for (int i=0; i < s1.length; i++)
486 var look_for = s1.slice (i, i + 1);
487 int contains = this._contains (s2, look_for, i, max_dist);
496 debug ("%s and %s have %d matches and %f / 2 transpositions\n",
503 /* If haystack contains c in pos return 0, if it contains
504 * it withing the bounds of max_dist return abs(pos-pos_found).
505 * If its not found, return -1. */
506 private int _contains (string haystack, string c, int pos, int max_dist)
508 if (pos < haystack.length && haystack.slice (pos, pos + 1) == c)
511 for (int i=pos-max_dist; i <= pos + max_dist; i++)
513 if (i < 0 || i >= haystack.length)
516 var str = haystack.slice (i, i + 1);
518 return (pos - i).abs ();