From: philip.liard@gmail.com Date: Thu, 18 Aug 2011 09:12:21 +0000 (+0000) Subject: TOOLS: Add Java tool to combine the geocoding data. X-Git-Tag: upstream/5.3.2~202 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=8c257ca5844fd161a1f51997ac9cd899d952d28a;p=platform%2Fupstream%2Flibphonenumber.git TOOLS: Add Java tool to combine the geocoding data. git-svn-id: http://libphonenumber.googlecode.com/svn/trunk@337 ee073f10-1060-11df-b6a4-87a95322a99c --- diff --git a/tools/java/data/pom.xml b/tools/java/data/pom.xml new file mode 100644 index 0000000..d669c1a --- /dev/null +++ b/tools/java/data/pom.xml @@ -0,0 +1,56 @@ + + + 4.0.0 + + tools + com.google.i18n.phonenumbers + 1.0-SNAPSHOT + + com.google.i18n.phonenumbers + data-tools + 1.0-SNAPSHOT + Libphonenumber Data tools + + + + junit + junit + 4.8.1 + test + + + + + src + test + + + org.apache.maven.plugins + maven-assembly-plugin + 2.2.1 + + + jar-with-dependencies + + + + true + com.google.i18n.phonenumbers.tools.CombineGeoData + + + + + + make-assembly + package + + single + + + + + + + + diff --git a/tools/java/data/src/com/google/i18n/phonenumbers/tools/CombineGeoData.java b/tools/java/data/src/com/google/i18n/phonenumbers/tools/CombineGeoData.java new file mode 100644 index 0000000..ed03281 --- /dev/null +++ b/tools/java/data/src/com/google/i18n/phonenumbers/tools/CombineGeoData.java @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2011 The Libphonenumber Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.tools; + +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.logging.Logger; + +/** + * Utility class that makes the geocoding data as small as possible. This class assumes the + * geocoding data provided as input doesn't contain any gap thus should not be used with incomplete + * data (missing prefixes). + *
+ * Example:        Can be combined as:
+ *   33131|Paris     331|Paris
+ *   33132|Paris     334|Marseille
+ *   3341|Marseille
+ * 
/**
 * Utility class that makes the geocoding data as small as possible by repeatedly replacing the
 * prefixes of a range (consecutive prefixes sharing a description and all digits but the last) with
 * the one-digit-shorter prefix, as long as that shorter prefix does not cover a prefix of an
 * adjacent range. This class assumes the geocoding data provided as input doesn't contain any gap,
 * thus it should not be used with incomplete data (missing prefixes).
 *
 * <pre>
 * Example:        Can be combined as:
 *   33131|Paris     331|Paris
 *   33132|Paris     334|Marseille
 *   3341|Marseille
 * </pre>
 *
 * @author Philippe Liard
 */
public class CombineGeoData {
  private static final Logger LOGGER = Logger.getLogger(CombineGeoData.class.getName());

  // Matches a whole 'digits|description' line. The pipe must be escaped: unescaped it is the regex
  // alternation operator, which turns the pattern into "digits OR anything" and accepts every
  // non-empty line. Compiled once because it is applied to every input line.
  private static final java.util.regex.Pattern INPUT_LINE =
      java.util.regex.Pattern.compile("\\d+\\|.+");

  private final InputStream inputStream;
  private final OutputStream outputStream;

  public CombineGeoData(InputStream inputStream, OutputStream outputStream) {
    this.inputStream = inputStream;
    this.outputStream = outputStream;
  }

  /**
   * Utility class that contains two indexes (start and end), both inclusive.
   */
  static class Range {
    public final int start;
    public final int end;

    public Range(int start, int end) {
      this.start = start;
      this.end = end;
    }
  }

  /**
   * Parses the input text file expected to contain lines written as 'prefix|description'. Lines
   * that do not have this shape (blank lines, lines without a pipe, non-numeric prefixes) are
   * skipped. If a prefix appears several times, the last description wins.
   *
   * @return the map of phone prefix data parsed, sorted by prefix (string order).
   * @throws IOException if reading from the input stream fails
   */
  private SortedMap<String, String> parseInput() throws IOException {
    SortedMap<String, String> outputMap = new TreeMap<String, String>();
    // NOTE(review): the reader (and the writer in run()) use the platform default charset; confirm
    // whether the geocoding files should be read with an explicit encoding.
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));

    for (String line; (line = bufferedReader.readLine()) != null; ) {
      if (!INPUT_LINE.matcher(line).matches()) {
        continue;
      }
      // The pattern guarantees a pipe directly after the leading digits, so the index is >= 1.
      int indexOfPipe = line.indexOf('|');
      outputMap.put(line.substring(0, indexOfPipe), line.substring(indexOfPipe + 1));
    }
    return outputMap;
  }

  /**
   * Creates a sorted array of phone number prefixes as strings from the provided phone number
   * prefix map.
   *
   * @param phonePrefixMap the map associating phone number prefixes and descriptions
   * @return the array of phone number prefixes sorted by string.
   */
  static String[] createSortedPrefixArray(SortedMap<String, String> phonePrefixMap) {
    String[] sortedPrefixes = new String[phonePrefixMap.size()];
    phonePrefixMap.keySet().toArray(sortedPrefixes);
    return sortedPrefixes;
  }

  /**
   * Finds the end index of the range of phone number prefixes starting at the provided index.
   * A range ends when a different description or prefix divided by 10 is encountered.
   *
   * @param prefixes the array of phone number prefixes sorted by string
   * @param phonePrefixMap the map associating phone number prefixes and descriptions
   * @param start the start index of the prefixes array
   * @return the index of the end of the range starting at the provided index
   * @throws NumberFormatException if a prefix is not parsable as an {@code int}
   */
  static int findRangeEnd(String[] prefixes, Map<String, String> phonePrefixMap, int start) {
    String firstPrefix = prefixes[start];
    int firstPrefixWithoutLastDigit = Integer.parseInt(firstPrefix) / 10;
    String firstLocation = phonePrefixMap.get(firstPrefix);

    // The entry at 'start' always belongs to its own range, so scanning begins right after it.
    for (int i = start + 1; i < prefixes.length; i++) {
      String currentPrefix = prefixes[i];
      String currentLocation = phonePrefixMap.get(currentPrefix);
      if (!currentLocation.equals(firstLocation)
          || Integer.parseInt(currentPrefix) / 10 != firstPrefixWithoutLastDigit) {
        return i - 1;
      }
    }
    return prefixes.length - 1;
  }

  /**
   * Splits the provided array of prefixes into an array of ranges. A range contains the start and
   * end indexes of a set of mappings that share the same description and have the same prefix minus
   * the last digit.
   *
   * @param prefixes the array of phone number prefixes sorted by string
   * @param phonePrefixMap the map associating phone number prefixes and descriptions
   * @return the list of ranges
   */
  static List<Range> createRanges(String[] prefixes, Map<String, String> phonePrefixMap) {
    List<Range> ranges = new ArrayList<Range>();
    int index = 0;
    int phonePrefixMapSize = phonePrefixMap.size();

    while (index < phonePrefixMapSize) {
      int rangeEnd = findRangeEnd(prefixes, phonePrefixMap, index);
      ranges.add(new Range(index, rangeEnd));
      index = rangeEnd + 1;
    }
    return ranges;
  }

  /**
   * Checks whether the provided candidate prefix conflicts with the prefixes contained in the
   * provided range. A conflict occurs if the provided prefix covers (is a prefix of) one of the
   * prefixes contained in the range.
   *
   * @param prefixes the array of phone number prefixes sorted by string
   * @param candidate the candidate phone number prefix
   * @param start the start of the range (inclusive)
   * @param end the end of the range (inclusive)
   * @return whether the candidate prefix conflicts with the prefixes contained in the range
   */
  static boolean findConflict(String[] prefixes, int candidate, int start, int end) {
    String candidateAsString = String.valueOf(candidate);
    for (int i = start; i <= end; i++) {
      if (prefixes[i].startsWith(candidateAsString)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks whether the provided candidate prefix conflicts with the prefixes contained in the
   * ranges adjacent (before and after the provided range) of the provided range.
   *
   * @param ranges the list of ranges
   * @param rangeIndex the index of the range in which the conflict search occurs
   * @param prefixes the array of phone number prefixes sorted by string
   * @param candidate the candidate phone number prefix
   * @return whether a conflict was found in one of the adjacent ranges
   */
  static boolean hasConflict(List<Range> ranges, int rangeIndex, String[] prefixes,
      int candidate) {
    if (rangeIndex > 0) {
      Range previousRange = ranges.get(rangeIndex - 1);
      if (findConflict(prefixes, candidate, previousRange.start, previousRange.end)) {
        return true;
      }
    }
    if (rangeIndex < ranges.size() - 1) {
      Range nextRange = ranges.get(rangeIndex + 1);
      if (findConflict(prefixes, candidate, nextRange.start, nextRange.end)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Combines the mappings contained in the provided map. A new combined map is returned as a result
   * in case any combination occurred. Otherwise (if no combination occurred) the same map (same
   * identity) is returned. Note that this method performs a 'single step' (i.e performs only one
   * combination iteration).
   *
   * @param phonePrefixMap the map associating phone number prefixes and descriptions
   * @return the combined map, or {@code phonePrefixMap} itself if nothing could be combined
   */
  static SortedMap<String, String> combine(SortedMap<String, String> phonePrefixMap) {
    String[] prefixes = createSortedPrefixArray(phonePrefixMap);
    List<Range> ranges = createRanges(prefixes, phonePrefixMap);
    // Maps the index of a range to the shortened prefix that replaces the whole range.
    Map<Integer, Integer> combinedPrefixes = new HashMap<Integer, Integer>();
    int rangeIndex = 0;

    for (Range range : ranges) {
      int prefixCandidate = Integer.parseInt(prefixes[range.start]) / 10;
      // A candidate of 0 means the prefix is already a single digit and cannot be shortened.
      if (prefixCandidate != 0 && !hasConflict(ranges, rangeIndex, prefixes, prefixCandidate)) {
        combinedPrefixes.put(rangeIndex, prefixCandidate);
      }
      ++rangeIndex;
    }
    if (combinedPrefixes.isEmpty()) {
      // Returning the same instance is how combineMultipleTimes() detects that it is done.
      return phonePrefixMap;
    }
    SortedMap<String, String> combinedMap = new TreeMap<String, String>();
    rangeIndex = 0;
    for (Range range : ranges) {
      Integer combinedRange = combinedPrefixes.get(rangeIndex++);
      if (combinedRange != null) {
        // All the prefixes of a range share the same description: use the first one.
        String firstPrefixOfRange = prefixes[range.start];
        combinedMap.put(String.valueOf(combinedRange), phonePrefixMap.get(firstPrefixOfRange));
      } else {
        for (int i = range.start; i <= range.end; i++) {
          String prefix = prefixes[i];
          combinedMap.put(prefix, phonePrefixMap.get(prefix));
        }
      }
    }
    return combinedMap;
  }

  /**
   * Combines the provided map associating phone number prefixes and descriptions, repeating the
   * single combination step until it leaves the map unchanged.
   *
   * @param phonePrefixMap the map associating phone number prefixes and descriptions
   * @return the combined map
   */
  static SortedMap<String, String> combineMultipleTimes(SortedMap<String, String> phonePrefixMap) {
    SortedMap<String, String> previousMap = null;
    // combine() returns its argument (same identity) when nothing was combined.
    while (phonePrefixMap != previousMap) {
      previousMap = phonePrefixMap;
      phonePrefixMap = combine(phonePrefixMap);
    }
    return phonePrefixMap;
  }

  /**
   * Combines the geocoding data read from the provided input stream and writes it as a result to
   * the provided output stream, one 'prefix|description' mapping per line. The streams are flushed
   * but not closed; closing them is left to the caller.
   *
   * @throws IOException if reading the input or writing the output fails
   */
  public void run() throws IOException {
    SortedMap<String, String> phonePrefixMap = parseInput();
    phonePrefixMap = combineMultipleTimes(phonePrefixMap);
    PrintWriter printWriter = new PrintWriter(new BufferedOutputStream(outputStream));
    for (Map.Entry<String, String> mapping : phonePrefixMap.entrySet()) {
      printWriter.printf("%s|%s\n", mapping.getKey(), mapping.getValue());
    }
    // PrintWriter never throws on write failures; checkError() flushes and reports them.
    if (printWriter.checkError()) {
      throw new IOException("Failed to write the combined geocoding data");
    }
  }

  public static void main(String[] args) {
    if (args.length != 2) {
      LOGGER.severe("usage: java -jar combine-geodata.jar /path/to/input /path/to/output");
      System.exit(1);
    }
    try {
      InputStream input = new FileInputStream(args[0]);
      try {
        OutputStream output = new FileOutputStream(args[1]);
        try {
          new CombineGeoData(input, output).run();
        } finally {
          output.close();
        }
      } finally {
        input.close();
      }
    } catch (Exception e) {
      // Log the exception itself: getMessage() may be null and drops the stack trace.
      LOGGER.log(java.util.logging.Level.SEVERE, "Failed to combine the geocoding data", e);
      System.exit(1);
    }
  }
}

// Patch metadata that shared the last replaced line (header of the next file in this commit diff):
//   diff --git a/tools/java/data/test/com/google/i18n/phonenumbers/tools/CombineGeoDataTest.java
//     b/tools/java/data/test/com/google/i18n/phonenumbers/tools/CombineGeoDataTest.java
//   new file mode 100644, index 0000000..c6b1fea, --- /dev/null, @@ -0,0 +1,233 @@
+ */ + +package com.google.i18n.phonenumbers.tools; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import com.google.i18n.phonenumbers.tools.CombineGeoData.Range; + +import org.junit.Test; + +import java.util.List; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * Unit tests for CombineGeoData class. + * + * @author Philippe Liard + */ +public class CombineGeoDataTest { + @Test + public void createSortedPrefixArray() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("122", null); + phonePrefixMap.put("42", null); + phonePrefixMap.put("4012", null); + phonePrefixMap.put("1000", null); + + String[] sortedPrefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + assertEquals("1000", sortedPrefixes[0]); + assertEquals("122", sortedPrefixes[1]); + assertEquals("4012", sortedPrefixes[2]); + assertEquals("42", sortedPrefixes[3]); + } + + @Test + public void findRangeEndFromStart() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33130", "Paris"); + phonePrefixMap.put("33139", "Paris"); + phonePrefixMap.put("334", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + int rangeEnd = CombineGeoData.findRangeEnd(prefixes, phonePrefixMap, 0); + assertEquals(1, rangeEnd); + } + + @Test + public void findRangeEndFromMiddle() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33130", "Paris"); + phonePrefixMap.put("33139", "Paris"); + phonePrefixMap.put("3341", "Marseille"); + phonePrefixMap.put("3342", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + int rangeEnd = CombineGeoData.findRangeEnd(prefixes, phonePrefixMap, 2); + assertEquals(3, rangeEnd); + } + + @Test + public void findRangeEndWithSameLocationButDifferentPrefix() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33130", "Paris"); + 
phonePrefixMap.put("3314", "Paris"); + phonePrefixMap.put("3341", "Marseille"); + phonePrefixMap.put("3342", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + int rangeEnd = CombineGeoData.findRangeEnd(prefixes, phonePrefixMap, 0); + assertEquals(0, rangeEnd); + } + + @Test + public void createRanges() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33120", "Paris"); + phonePrefixMap.put("33130", "Paris"); + phonePrefixMap.put("33139", "Paris"); + phonePrefixMap.put("3341", "Marseille"); + phonePrefixMap.put("3342", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + List ranges = CombineGeoData.createRanges(prefixes, phonePrefixMap); + assertEquals(3, ranges.size()); + assertEquals(0, ranges.get(0).start); + assertEquals(0, ranges.get(0).end); + assertEquals(1, ranges.get(1).start); + assertEquals(2, ranges.get(1).end); + assertEquals(3, ranges.get(2).start); + assertEquals(4, ranges.get(2).end); + } + + @Test + public void findConflict() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33130", "Saint Germain en Laye"); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33139", "Paris"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + assertTrue(CombineGeoData.findConflict(prefixes, 3313, 0, 0)); + } + + @Test + public void conflictBefore() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33130", "Saint Germain en Laye"); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33139", "Paris"); + phonePrefixMap.put("3341", "Marseille"); + phonePrefixMap.put("3342", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + List ranges = CombineGeoData.createRanges(prefixes, phonePrefixMap); + assertTrue(CombineGeoData.hasConflict(ranges, 1, prefixes, 3313)); + } + + @Test + public void conflictAfter() { + SortedMap phonePrefixMap = 
new TreeMap(); + phonePrefixMap.put("33122", "Poissy"); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33138", "Paris"); + phonePrefixMap.put("33139", "Saint Germain en Laye"); + phonePrefixMap.put("3341", "Marseille"); + phonePrefixMap.put("3342", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + List ranges = CombineGeoData.createRanges(prefixes, phonePrefixMap); + assertEquals(4, ranges.size()); + assertTrue(CombineGeoData.hasConflict(ranges, 1, prefixes, 3313)); + } + + @Test + public void noConflict() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33122", "Poissy"); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33138", "Paris"); + phonePrefixMap.put("33149", "Saint Germain en Laye"); + phonePrefixMap.put("3341", "Marseille"); + phonePrefixMap.put("3342", "Marseille"); + + String[] prefixes = CombineGeoData.createSortedPrefixArray(phonePrefixMap); + List ranges = CombineGeoData.createRanges(prefixes, phonePrefixMap); + assertEquals(4, ranges.size()); + assertFalse(CombineGeoData.hasConflict(ranges, 1, prefixes, 3313)); + } + + @Test + public void combineRemovesLastDigit() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33122", "Poissy"); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33149", "Saint Germain en Laye"); + phonePrefixMap.put("3342", "Marseille"); + + phonePrefixMap = CombineGeoData.combine(phonePrefixMap); + assertEquals(4, phonePrefixMap.size()); + assertEquals("Poissy", phonePrefixMap.get("3312")); + assertEquals("Paris", phonePrefixMap.get("3313")); + assertEquals("Saint Germain en Laye", phonePrefixMap.get("3314")); + assertEquals("Marseille", phonePrefixMap.get("334")); + } + + @Test + public void combineMergesSamePrefixAndLocation() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33133", "Paris"); + phonePrefixMap.put("33134", "Paris"); + + 
phonePrefixMap = CombineGeoData.combine(phonePrefixMap); + assertEquals(1, phonePrefixMap.size()); + assertEquals("Paris", phonePrefixMap.get("3313")); + } + + @Test + public void combineWithNoPossibleCombination() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("3312", "Poissy"); + phonePrefixMap.put("3313", "Paris"); + phonePrefixMap.put("3314", "Saint Germain en Laye"); + + phonePrefixMap = CombineGeoData.combine(phonePrefixMap); + assertEquals(3, phonePrefixMap.size()); + assertEquals("Poissy", phonePrefixMap.get("3312")); + assertEquals("Paris", phonePrefixMap.get("3313")); + assertEquals("Saint Germain en Laye", phonePrefixMap.get("3314")); + } + + @Test + public void combineMultipleTimes() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("33132", "Paris"); + phonePrefixMap.put("33133", "Paris"); + phonePrefixMap.put("33134", "Paris"); + + phonePrefixMap = CombineGeoData.combineMultipleTimes(phonePrefixMap); + assertEquals(1, phonePrefixMap.size()); + assertEquals("Paris", phonePrefixMap.get("3")); + } + + @Test + public void combineMultipleTimesWithPrefixesWithDifferentLengths() { + SortedMap phonePrefixMap = new TreeMap(); + phonePrefixMap.put("332", "Paris"); + phonePrefixMap.put("33133", "Paris"); + phonePrefixMap.put("41", "Marseille"); + + phonePrefixMap = CombineGeoData.combineMultipleTimes(phonePrefixMap); + assertEquals(2, phonePrefixMap.size()); + assertEquals("Paris", phonePrefixMap.get("3")); + assertEquals("Marseille", phonePrefixMap.get("4")); + } +} diff --git a/tools/java/pom.xml b/tools/java/pom.xml index 2165a2c..ddcfd59 100644 --- a/tools/java/pom.xml +++ b/tools/java/pom.xml @@ -22,6 +22,7 @@ common cpp-build + data java-build