001    /*
002    Copyright (c) 1996-2011, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.clApp.offline;
031    
032    import java.io.BufferedReader;
033    import java.io.EOFException;
034    import java.io.File;
035    import java.io.FileReader;
036    import java.io.FileWriter;
037    import java.io.IOException;
038    import java.io.PrintWriter;
039    import java.net.InetAddress;
040    import java.net.UnknownHostException;
041    import java.util.ArrayList;
042    import java.util.Arrays;
043    import java.util.Collections;
044    import java.util.HashMap;
045    import java.util.List;
046    import java.util.Map;
047    import java.util.Set;
048    import java.util.SortedMap;
049    import java.util.SortedSet;
050    import java.util.StringTokenizer;
051    import java.util.TreeMap;
052    import java.util.TreeSet;
053    
054    import org.hd.d.pg2k.svrCore.AddrTools;
055    import org.hd.d.pg2k.svrCore.MemoryTools;
056    import org.hd.d.pg2k.svrCore.location.GeoProximity;
057    import org.hd.d.pg2k.svrCore.location.GeoUtils;
058    
059    /**
060     * Created by IntelliJ IDEA.
061     * User: DHD
062     * Date: 01-May-2006
063     * Time: 12:02:55
064     */
065    
066    /**Off-line utility to create properties file for IPv4 to ccTLD mapping.
067     */
068    public class MakeCCTLDFromIPPrefixProperties
069        {
070        /**Prevent instances from being constructed (private, no-argument, empty body). */
071        private MakeCCTLDFromIPPrefixProperties() { }
072    
073        /**Input data format name for a MySQL-dump download from www.hostIP.info (circa 2006/04). */
074        public static final String INPUT_FORMAT_HOSTIP = "hostIPSQL";
075    
076        /**All allowed input format names, as an unmodifiable sorted set; never null. Currently holds only INPUT_FORMAT_HOSTIP. */
077        public static final SortedSet<String> INPUT_FORMATS =
078            Collections.unmodifiableSortedSet(new TreeSet<String>(Arrays.asList(new String[]{
079                INPUT_FORMAT_HOSTIP,
080                })));
081    
082        /**Threshold (out of 256) at which we will discard minority values when doing lossy cc-prefix-table compression; strictly positive.
083         * If all the values in a given block share the same value except a small minority,
084         * and the count of minority items is less than or equal to this value,
085         * then we pretend that the exceptions did not exist so we can trim down the tree at this point.
086         * <p>
087         * This is computed so that, given the rough expected maximum cost of a worst-possible routing error
088         * (paying transit and getting ropey connectivity rather than a fast free local connection),
089         * we will not on average pay more than about twice the optimum routing costs
090         * because of our lossy encoding.
091         * This assumes that we'd never do much better than "country" level routing.
092         */
093        private static final int UNCONDITIONAL_PREFIX_LOSSY_COMPRESS_THRESHOLD = 1 +
094                256/GeoProximity.COUNTRY.getCloseness(); // Integer division; the leading "1 +" keeps the result at least 1 for a positive closeness.
095    
096        /**Threshold (out of 256) at which we will lump countries into a region when doing lossy cc-prefix-table compression; strictly positive.
097         * Provided that all but at most this number of entries in a given block
098         * are in the same region as the dominant ccTLD in that block
099         * then during lossy compression we can replace them with that dominant ccTLD.
100         * <p>
101         * The cost of this is assumed to be mainly the
102         * routing cost for the wrong country within a region when claiming the dominant ccTLD.
103         * (We currently neglect the possible cost of completely-wrong routing out-of-region.)
104         */
105        private static final int REGION_PREFIX_LOSSY_COMPRESS_THRESHOLD = 1 +
106                256/GeoProximity.COUNTRYGROUP.getCloseness(); // Integer division; the leading "1 +" keeps the result at least 1 for a positive closeness.
107    
108        /**Threshold (out of 256) at which we will lump countries/regions into a continent when doing lossy cc-prefix-table compression; strictly positive.
109         * Provided that all but at most this number of entries in a given block
110         * are in the same region as the dominant ccTLD or region in that block
111         * then during lossy compression we can replace them with that dominant ccTLD/region.
112         * <p>
113         * The cost of this is assumed to be mainly the
114         * routing cost for the wrong country within a continent when claiming the dominant ccTLD/region.
115         * (We currently neglect the possible cost of completely-wrong routing out-of-continent.)
116         */
117        private static final int CONTINENT_PREFIX_LOSSY_COMPRESS_THRESHOLD = 1 +
118                256/GeoProximity.CONTINENT.getCloseness(); // Integer division; the leading "1 +" keeps the result at least 1 for a positive closeness.
119    
120    
121        /**Dump an IP-prefix map as-is to the given Writer.
122         * This does no "optimisation" or other transformation.
123         * <p>
124         * The output is printed in sorted order by the address-prefix key.
125         * <p>
126         * The output should be directly suitable to form all or part of a
127         * ccTLDFromIPPrefix.properties set.
128         */
129        public static final void dumpPrefixMap(final SortedMap<AddrTools.AddrPrefix,String> map, final PrintWriter pw)
130            {
131            for(final AddrTools.AddrPrefix ap : map.keySet())
132                { pw.println(ap.toPaddedDottedPrefix() + ' ' + map.get(ap)); }
133            }
134    
135        /**Perform (lossy) compression on an IP-prefix-to-ccTLD map.
136         * This may use lossy compression of the loaded values
137         * where the saving is large and the implied increase in routing/performance cost is low.
138         * <p>
139         * We never delete "" entries to avoid semantic changes,
140         * nor top-level (1-octet) entries for clarity.
141         */
142        private static SortedMap<AddrTools.AddrPrefix,String> lossyCompressCcTLDFromIPPrefixMap(final SortedMap<AddrTools.AddrPrefix,String> mapIn)
143            {
144            // We take a working copy of the map.
145            final SortedMap<AddrTools.AddrPrefix,String> work = new TreeMap<AddrTools.AddrPrefix, String>(mapIn);
146    
147            // We find the length of the longest prefix in the map.
148            int longestPrefix = 0;
149            for(final AddrTools.AddrPrefix prefix : work.keySet())
150                { if(prefix.length() > longestPrefix) { longestPrefix = prefix.length(); } }
151    
152            // We work backwards in length,
153            // trying to trim away large numbers of leaves that carry little information...
154            // We never try to remove 1-octect entries.
155            for(int i = longestPrefix; i > 1; --i)
156                { _lossyTrimPrefixesAtLength(work, i); }
157    
158            return(work);
159            }
160    
161        /**Try to trim away leaves at the given prefix length.
162         * We never remove "" entries.
163         * <p>
164         * We won't remove entries at the stated length
165         * that have longer sub-entries.
166         * This implies that trimming should be performed from the longest prefixes
167         * down to the shortest.
168         * <p>
169         * The map is updated in place.
170         */
171        private static void _lossyTrimPrefixesAtLength(final SortedMap<AddrTools.AddrPrefix,String> map,
172                                                       final int lengthToTrim)
173            {
174            if((map == null) || (lengthToTrim < 1))
175                { throw new IllegalArgumentException(); }
176    
177    System.out.println("_lossyTrimPrefixesAtLength(): STARTING trim at length: " + lengthToTrim);
178    
179            // First we copy all the keys into a List for easier manipulation.
180            // We assume that this is always in sorted order.
181            final AddrTools.AddrPrefix keys[] = map.keySet().toArray(new AddrTools.AddrPrefix[map.size()]);
182    
183            // We will search for entries of the stated length
184            // working on blocks of those that differ only in their final byte
185            // and refusing to trim if there are longer sub-entries.
186            nextBlock: for(int i = 0; i < keys.length; ++i)
187                {
188                final AddrTools.AddrPrefix ap = keys[i];
189                if(ap.length() != lengthToTrim) { continue; }
190    
191                // Make the prefix that is one octet shorter.
192                final AddrTools.AddrPrefix prefix = new AddrTools.AddrPrefix(ap, ap.length() - 1);
193    
194                // Note if we find longer sub-entries.
195                boolean foundLongerSubEntries = false;
196    
197                // Find how big this block is, and possibly trim it.
198                // Remember all non-"" entries, ie that we might zap.
199                final List<AddrTools.AddrPrefix> potentiallyZappable = new ArrayList<AddrTools.AddrPrefix>();
200                final int blockStart = i;
201                int blockEnd = blockStart;
202                for(int j = blockStart; j < keys.length; ++j)
203                    {
204                    final AddrTools.AddrPrefix keyInBlock = keys[j];
205                    if(!keyInBlock.isStrictPrefix(prefix))
206                        {
207                        // Left the block of candidate entries.
208                        break;
209                        }
210    
211                    // If not a "" entry, then it is potentially zappable.
212                    final boolean isEmptyString = "".equals(map.get(keyInBlock));
213                    if(!isEmptyString)
214                        { potentiallyZappable.add(keyInBlock); }
215    
216                    // Still part of the block...
217                    blockEnd = j;
218    
219                    // We will not be able to trim if we find longer sub-entries
220                    // other than "" entries which we always keep.
221                    if((keyInBlock.length() > lengthToTrim) && !isEmptyString)
222                        { foundLongerSubEntries = true; }
223                    }
224    
225                // Whether we trim or not,
226                // the outer loop can start no skip this block
227                // (note that it will get incremented at the loop re-initialisation).
228                i = blockEnd;
229    
230    //            final int blockLength = blockEnd - blockStart + 1;
231    
232    //System.out.println("_lossyTrimPrefixesAtLength(): found candidate block to trim: " + prefix + ".*; blockLength=" + blockLength +
233    //                   ", potentiallyZappable.size()=" + potentiallyZappable.size() +
234    //                   (!foundLongerSubEntries ? "" : " (untrimmable because of longer non-\"\" sub-entries)"));
235    
236                // If we found longer sub-entries then we can't trim this block,
237                // so move to the next one.
238                if(foundLongerSubEntries)
239                    { continue nextBlock; /* Start on next block... */ }
240    
241                // Collect counts of different values at this level of the tree.
242                // Note that we do this by trying to look up every possible value
243                // within this block, even for those values not explicitly present,
244                // to correctly account for propagation upwards of more general entries.
245                final Map<String,Integer> valueCounts = new HashMap<String, Integer>();
246                // Count of null values, ie where the lookup could not find anything.
247                int nullCount = 0;
248    
249                // Do IPv4 address lookup,
250                // so use a 4-byte address (leaving trailing octets at zero).
251                final byte addr[] = new byte[4];
252                // Copy in the correct prefix...
253                final byte rawPrefix[] = prefix.toByteArray();
254                for(int j = rawPrefix.length; --j >= 0; ) { addr[j] = rawPrefix[j]; }
255    
256                try
257                    {
258                    for(int b = 256; --b >= 0; )
259                        {
260                        // Adjust the variable byte for this block.
261                        addr[rawPrefix.length] = (byte) b;
262    
263                        // Create an IPv4 InetAddress.
264                        final InetAddress synthInetAddress = InetAddress.getByAddress(addr);
265                        final String v = GeoUtils.lookupAddrInIPToCcTLDMap(synthInetAddress, map, ap.length());
266    
267                        // Count all successful lookups.
268                        if(v != null)
269                            {
270                            final Integer count = valueCounts.get(v);
271                            if(count == null) { valueCounts.put(v, 1); }
272                            else { valueCounts.put(v, 1 + count); }
273                            }
274                        else { ++nullCount; }
275                        }
276                    }
277                catch(final UnknownHostException e) // Should never happen.
278                    {
279                    e.printStackTrace();
280                    throw new Error("Internal error");
281                    }
282    
283    //System.out.println("_lossyTrimPrefixesAtLength(): block="+prefix+".*, nulls="+nullCount+", valueCounts=" + valueCounts);
284    
285                // OK, decide if we can compress this block.
286                // If we *do*, then we remove all non-"" nodes from the block
287                // and write store at the prefix level the chosen representative value
288                // (if that is null, it means that we remove the prefix node too).
289    
290                // Simplest case:
291                // One overwhelming majority value, whatever it is,
292                // if in a huge majority, is used to replace this whole block.
293                final int overwhelmThreshold = 256 - UNCONDITIONAL_PREFIX_LOSSY_COMPRESS_THRESHOLD;
294                assert(overwhelmThreshold > 128); // Must be a clear majority to avoid ambiguity...
295    
296                // If we know almost nothing about this block
297                // then we simply remove the noise within it.
298                if(nullCount >= overwhelmThreshold)
299                    {
300                    // Zap all the (non-"") nodes from this block and possibly the prefix node.
301    System.out.println("_lossyTrimPrefixesAtLength(): removing block below "+prefix+" (and parent node if > 1 octet); potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
302                    if(prefix.length() > 1) { map.remove(prefix); }
303                    map.keySet().removeAll(potentiallyZappable);
304                    continue nextBlock; // Start on next block...
305                    }
306    
307    //            // If we have too many unknowns in a block to be confortable about
308    //            // making any specific routing judgement for the whole block
309    //            // then give up now.
310    //            if(nullCount > UNCONDITIONAL_PREFIX_LOSSY_COMPRESS_THRESHOLD)
311    //                { continue nextBlock; /* Start on next block... */ }
312    
313                // Find the value with the highest count...
314                String highestCountValue = null;
315                int highestCount = 0;
316                for(final String v : valueCounts.keySet())
317                    {
318                    final int c = valueCounts.get(v);
319                    if(c > highestCount)
320                        {
321                        highestCount = c;
322                        highestCountValue = v;
323                        }
324                    }
325                assert((highestCountValue != null) && (highestCount > 0));
326    //System.out.println("_lossyTrimPrefixesAtLength(): dominant="+ highestCountValue + ", count="+highestCount);
327    
328                // If there is one overwhelmingly common entry in a block
329                // then rewrite the block to that one value.
330                if(highestCount >= overwhelmThreshold)
331                    {
332                    // Zap all the (non-"") nodes from this block and replace the prefix node.
333    System.out.println("_lossyTrimPrefixesAtLength(): replacing block with new parent node: `" + prefix + ' ' + highestCountValue + "'; potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
334                    map.put(prefix, highestCountValue);
335                    map.keySet().removeAll(potentiallyZappable);
336                    continue nextBlock; // Start on next block...
337                    }
338    
339                final int regionThreshold = 256 - REGION_PREFIX_LOSSY_COMPRESS_THRESHOLD;
340    
341                // If there is one very common ccTLD in a block
342                // and all but a very few other entries are countries in the same region,
343                // then replace with the dominant ccTLD.
344                if(GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) &&
345                        (highestCount >= regionThreshold))
346                    {
347                    // Check that enough other entries are countries in the same region.
348                    int outOfRegion = nullCount; // Include the nulls in the not-same-region list.
349                    final Set<GeoUtils.CCTLD> closeCcTLDs = GeoUtils.getCloseCCTLDs(new GeoUtils.CCTLD(highestCountValue));
350                    for(final String v : valueCounts.keySet())
351                        {
352                        if(v.equals(highestCountValue)) { continue; }
353    
354                        if(!GeoUtils.CCTLD.isSyntaticallyValidCcTLD(v) || !closeCcTLDs.contains(new GeoUtils.CCTLD(v)))
355                            {
356                            // Bad luck; this isn't a close country...
357                            outOfRegion += valueCounts.get(v);
358    //System.out.println("_lossyTrimPrefixesAtLength(): outOfRegion="+outOfRegion+" after "+v +" dominant="+highestCountValue);
359                            }
360                        }
361                    if(outOfRegion <= REGION_PREFIX_LOSSY_COMPRESS_THRESHOLD)
362                        {
363                        // Zap all the (non-"") nodes from this block and replace the prefix node.
364    System.out.println("_lossyTrimPrefixesAtLength(): replacing block with new parent node: `" + prefix + ' ' + highestCountValue + "'; potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
365                        map.put(prefix, highestCountValue);
366                        map.keySet().removeAll(potentiallyZappable);
367                        continue nextBlock; // Start on next block...
368                        }
369                    }
370    
371    
372                final int continentThreshold = 256 - CONTINENT_PREFIX_LOSSY_COMPRESS_THRESHOLD;
373    
374                // If there is one very common ccTLD or region/registry/continent in a block
375                // and all but a very few other entries are in the same region,
376                // then replace with the dominant ccTLD/region.
377                if((GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) ||
378                        GeoUtils.isSyntaticallyValidRegistryName(highestCountValue)) &&
379                            (highestCount >= continentThreshold))
380                    {
381                    // Check that enough other entries are countries in the same region.
382                    int outOfContinent = nullCount; // Include the nulls in the not-same-continent list.
383                    final Set<GeoUtils.CCTLD> closeCcTLDs = GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) ?
384                                                            GeoUtils.getCloseCCTLDs(new GeoUtils.CCTLD(highestCountValue)) :
385                                                            GeoUtils.getCountriesInRegion(highestCountValue);
386                    for(final String v : valueCounts.keySet())
387                        {
388                        if(v.equals(highestCountValue)) { continue; }
389    
390                        // Make sure that this is a country close to the dominant country,
391                        // OR that this is a country in the dominant region,
392                        // else it is "out of area".
393                        if((!GeoUtils.CCTLD.isSyntaticallyValidCcTLD(v) || !closeCcTLDs.contains(new GeoUtils.CCTLD(v))) &&
394                                (!GeoUtils.isSyntaticallyValidRegistryName(v) ||
395                                !GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) ||
396                                !GeoUtils.getCountriesInRegion(v).contains(new GeoUtils.CCTLD(highestCountValue))))
397                            {
398                            // Bad luck; this isn't a close country/region...
399                            outOfContinent += valueCounts.get(v);
400    //System.out.println("_lossyTrimPrefixesAtLength(): outOfContinent="+outOfContinent+" after "+v +" dominant="+highestCountValue);
401                            }
402                        }
403                    if(outOfContinent <= CONTINENT_PREFIX_LOSSY_COMPRESS_THRESHOLD)
404                        {
405                        // Zap all the (non-"") nodes from this block and replace the prefix node.
406    System.out.println("_lossyTrimPrefixesAtLength(): replacing block with new parent node: `" + prefix + ' ' + highestCountValue + "'; potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
407                        map.put(prefix, highestCountValue);
408                        map.keySet().removeAll(potentiallyZappable);
409                        continue nextBlock; // Start on next block...
410                        }
411                    }
412    
413    
414    
415    
416    
417    
418    
419    
420                // TODO: more subtle cases...
421    
422    
423    
424    
425    
426    
427                // If no lossy compression scheme has applied
428                // and if there are no nulls at this level
429                // then attempt some simple lossless compression to save some space.
430                // Set the node above to the most common value,
431                // and remove all instances of that value from the block.
432                // Only worth doing when we will remove at least two sub-nodes.
433                if((nullCount == 0) && (highestCount > 1))
434                    {
435                    // Zap all the (non-"") nodes from this block and replace the prefix node.
436    System.out.println("_lossyTrimPrefixesAtLength(): removing dups at this level having added new parent node: `" + prefix + ' ' + highestCountValue + "', valueCounts="+valueCounts);
437                    map.put(prefix, highestCountValue);
438                    for(final AddrTools.AddrPrefix pzap : potentiallyZappable)
439                        {
440                        if(highestCountValue.equals(map.get(pzap)))
441                            { map.remove(pzap); }
442                        }
443                    continue nextBlock; // Start on next block...
444                    }
445                }
446            }
447    
448        /**Load external IP/location data into internal-style table; never null.
449         * This data can potentially be in one of several formats,
450         * each of which will require some massaging to get into our model.
451         * <p>
452         * Only input data records corresponding to ccTLDs explicitly listed in
453         * the "geo-proximity" data, ie whose getCloseCCTLDs() result is non-empty,
454         * will be retained.
455         * <p>
456         * We construct and return an unsorted map for speed.
457         *
458         * @param inputData  the (readable) input data file; never null
459         * @param inputFormat  the format (one of INPUT_FORMATS); never null
460         *
461         * @return input data in our format, for "interesting" ccTLDs
462         */
463        private static Map<AddrTools.AddrPrefix, String> loadData(final File inputData,
464                                                                  final String inputFormat)
465            throws IOException
466            {
467            final Map<AddrTools.AddrPrefix, String> result = new HashMap<AddrTools.AddrPrefix, String>();
468    
469            if(INPUT_FORMAT_HOSTIP.equals(inputFormat))
470                {
471                // Parse MySQL dump format!
472    
473                // Get a buffered handle on the country table data.
474                // This will be several DB rows as one or more lines
475                // near the start of the table...
476                final BufferedReader r1 = new BufferedReader(new FileReader(inputData));
477                // Prepare map of countries that we are interested in...
478                final Map<Integer, GeoUtils.CCTLD> countryNumberToCCTLD = new HashMap<Integer, GeoUtils.CCTLD>();
479                String countriesRecord;
480                try
481                    {
482                    while((countriesRecord = getNextRecordStarting(r1, "INSERT INTO countries ")) != null)
483                        {
484                        // Parse the countries data.
485                        // Be forgiving of records that we can't parse easily,
486                        // since there is some free text in country names.
487                        //
488                        // Expected row format (3 fields):
489                        //   * Internal country number (int)
490                        //   * Country name (String)
491                        //   * Upper-case CCTLD (String)
492                        final String[][] countriesData =
493                            parseINSERTRecord(countriesRecord, 3, true);
494    
495                        for(int i = 0; i < countriesData.length; ++i)
496                            {
497                            final String c = countriesData[i][2].toLowerCase();
498                            if(!GeoUtils.CCTLD.isSyntaticallyValidCcTLD(c)) { continue; }
499                            final GeoUtils.CCTLD ccTLD = new GeoUtils.CCTLD(c);
500                            if(GeoUtils.getCloseCCTLDs(ccTLD).isEmpty()) { continue; }
501                            // Interesting country, so file it...
502    System.out.println(" Keeping data on country: " + Arrays.asList(countriesData[i]));
503                            countryNumberToCCTLD.put(Integer.valueOf(countriesData[i][0]),
504                                                     MemoryTools.intern(new GeoUtils.CCTLD(MemoryTools.intern(c))));
505                            }
506    
507                        }
508                    }
509                finally { r1.close(); }
510    
511                if(countryNumberToCCTLD.isEmpty())
512                    { throw new EOFException("Could not find `countries` table/data."); }
513    
514                // Now parse the main data records...
515                // Get a new buffered handle on the country table data.
516                final BufferedReader r2 = new BufferedReader(new FileReader(inputData));
517                try
518                    {
519                    // Get the IP records in whatever order they are available...
520                    String ipRecord;
521                    final String prefix = "INSERT INTO ip4_";
522                    final int prefixLen = prefix.length();
523    //                int rowNum = 0;
524                    while((ipRecord = getNextRecordStarting(r2, prefix)) != null)
525                        {
526                        // Extract first byte of IP address (part of table name).
527                        final int q = ipRecord.indexOf(' ', prefixLen);
528                        final int byte1 = Integer.parseInt(ipRecord.substring(prefixLen, q), 10);
529    //System.out.println("Parsing entry "+(++rowNum)+" for addresses "+byte1+".X.X...  (Entries so far: "+result.size()+"...)");
530    
531                        final byte addrPrefix[] = { (byte) byte1, 0, 0 };
532    
533                        // Now find and store all the entries
534                        // for countries that we are interested in.
535                        //
536                        // Expected row format (5 fields):
537                        //   * Second byte of IPv4 address (int)
538                        //   * Third byte of IPv4 address (int)
539                        //   * Internal country number (int)
540                        //   * Internal city number (int)
541                        //   * Time/date of record.
542                        final String[][] addrData =
543                            parseINSERTRecord(ipRecord, 5, false);
544                        for(int i = addrData.length; --i >= 0; )
545                            {
546                            final String[] row = addrData[i];
547                            final GeoUtils.CCTLD cctld = countryNumberToCCTLD.get(Integer.valueOf(row[2]));
548                            if(cctld == null) { continue; }
549    
550                            addrPrefix[1] = (byte) Integer.parseInt(row[0], 10);
551                            addrPrefix[2] = (byte) Integer.parseInt(row[1], 10);
552                            result.put(new AddrTools.AddrPrefix(addrPrefix),
553                                       cctld.code);
554                            }
555                        }
556    
557    System.out.println("Finished reading data from: " + inputData);
558                    }
559                finally { r2.close(); }
560                }
561            else
562                { throw new IllegalArgumentException("unrecognised input format"); }
563    
564            return(result);
565            }
566    
567        /**Parse a MySQL insert record from a data dump.
568         * This parses a MySQL record of the format:
569         * <pre>
570    INSERT INTO `<i>table name</i>` VALUES (<i>field1,field2,...</i>),<i>...</i>(<i>...</i>);
571         * </pre>
572         *
573         * @param record  the full line; never null
574         * @param expectedFields  the expected number of expectedFields per record; non-negative
575         * @param ignoreBadRecords  silently skip bad records that we cannot parse
576         *     (for example because they contain our separator characters)
577         *
578         * @return zero or more rows each of the specified number of expectedFields;
579         *     no null records or fields
580         */
581        private static String[][] parseINSERTRecord(final String record,
582                                                    final int expectedFields,
583                                                    final boolean ignoreBadRecords)
584            {
585            if((record == null) || (expectedFields < 0))
586                { throw new IllegalArgumentException(); }
587    
588            if(!record.startsWith("INSERT INTO ") || (!record.endsWith(";")))
589                { throw new IllegalArgumentException("malformatted line start/end"); }
590    
591            final int valuesStart = record.indexOf(" VALUES ");
592            if(valuesStart == -1)
593                { throw new IllegalArgumentException("malformatted line VALUES"); }
594    
595            final String core = record.substring(valuesStart + 8, record.length()-1).trim();
596    //System.out.println("core: " + core);
597    
598            // Chop into table rows (when multiple rows are on one line)...
599            final StringTokenizer st = new StringTokenizer(core, ")(");
600            final List<String[]> allRows = new ArrayList<String[]>(1+(st.countTokens()/2));
601            for(int rowNum = 0; st.hasMoreTokens(); ++rowNum)
602                {
603                final String row = st.nextToken();
604                if(",".equals(row)) { continue; /* Row separator. */ }
605                final StringTokenizer stRow = new StringTokenizer(row, ",");
606    //System.out.println("ROW: `"+row+"'.");
607                final int nFields = stRow.countTokens();
608                if(nFields != expectedFields)
609                    {
610                    if(ignoreBadRecords) { continue; }
611                    throw new IllegalArgumentException("Wrong number of fields (expected "+expectedFields+", got "+nFields+") in record "+rowNum+": " + row);
612                    }
613                final String rowResult[] = new String[nFields];
614                for(int j = 0; j < nFields; ++j)
615                    {
616                    String s = stRow.nextToken();
617                    // Strip any surrounding quotes...
618                    if((s.length() > 2) && s.startsWith("'") && s.endsWith("'"))
619                        { s = s.substring(1, s.length()-1); }
620                    rowResult[j] = s;
621                    }
622    //System.out.println("PARSED ROW: " + Arrays.asList(rowResult));
623                allRows.add(rowResult);
624                }
625    
626            final String[][] result = new String[allRows.size()][];
627            allRows.toArray(result);
628            return(result);
629            }
630    
631        /**Returns the next line/record starting with the specified String, else null at EOF. */
632        private static String getNextRecordStarting(final BufferedReader r,
633                                                    final String lineStart)
634            throws IOException
635            {
636            String record;
637            while((record = r.readLine()) != null)
638                {
639                if(record.startsWith(lineStart))
640                    { return(record); }
641                }
642            return(null);
643            }
644    
645        /**An entry point to load the prefix map and write it out in a more compact format.
646         * This may use lossy compression of the loaded values
647         * where the saving is large
648         * and the implied increase in routing/performance cost is low.
649         * <p>
650         * The new map is dumped to the file named as the first argument.
651         * <p>
652         * If a source of new input data is supplied as an optional second argument,
653         * then it is merged into the existing data in memory,
654         * and the dump will be the lossily-compressed result.
655         * Only new input data corresponding to ccTLDs explicitly listed in
656         * the "geo-proximity" data, ie whose getCloseCCTLDs() result is non-empty,
657         * will be retained.
658         * This input data file may need to be re-read in multiple passes.
659         */
660        public static final void main(final String args[])
661            {
662            if((args.length != 1) && (args.length != 3))
663                {
664                System.err.println("Usage: main filenameToDumpTo [newDataFileName FORMAT]");
665                System.err.println("  Allowed values of FORMAT are: " + new ArrayList<String>(INPUT_FORMATS));
666                Runtime.getRuntime().exit(1);
667                return;
668                }
669    
670            final String filename = args[0];
671    
672            System.out.println("Will dump result to file: " + filename);
673    
674            final SortedMap<AddrTools.AddrPrefix,String> ccTLDFromIPPrefix = GeoUtils.getCcTLDFromIPPrefix();
675            System.out.println("Loaded static map size: " + ccTLDFromIPPrefix.size());
676    //        System.out.println("Loaded static map maximum key length: " + GeoUtils.ccTLDFromIPPrefixLongestKey);
677    
678            // The current static data...
679            SortedMap<AddrTools.AddrPrefix,String> ccTLDFromIPPrefixWorking = ccTLDFromIPPrefix;
680    
681            // If new data has been supplied,
682            // then load it now,
683            // overriding extant static data in case of conflict.
684            if(args.length == 3)
685                {
686                final File inputData = new File(args[1]);
687                final String inputFormat = args[2];
688    
689                if(!INPUT_FORMATS.contains(inputFormat))
690                    {
691                    System.err.println("I do not understand input format: " + inputFormat);
692                    System.err.println("Valid formats are: " + new ArrayList<String>(INPUT_FORMATS));
693                    System.exit(1);
694                    }
695    
696                if(!inputData.exists() || !inputData.canRead() || !inputData.isFile())
697                    {
698                    System.err.println("Cannot open input file: " + inputData);
699                    System.exit(1);
700                    }
701    
702                try
703                    {
704                    // Copy existing static data...
705                    ccTLDFromIPPrefixWorking = new TreeMap<AddrTools.AddrPrefix, String>(ccTLDFromIPPrefix);
706                    // Override with new data...
707                    ccTLDFromIPPrefixWorking.putAll(loadData(inputData, inputFormat));
708                    }
709                catch(final IOException e)
710                    {
711                    e.printStackTrace();
712                    System.err.println("Problem reading input data file: " + inputData + ": " + e.getMessage());
713                    System.exit(1);
714                    }
715                catch(final Exception e)
716                    {
717                    e.printStackTrace();
718                    System.err.println("Problem with input data file: " + inputData + ": " + e.getMessage());
719                    System.exit(1);
720                    }
721                }
722    
723            // Now transform/compress the map.
724            System.out.println("Starting lossy compression on map of size: " + ccTLDFromIPPrefixWorking.size());
725            final SortedMap<AddrTools.AddrPrefix,String> c1 = lossyCompressCcTLDFromIPPrefixMap(ccTLDFromIPPrefixWorking);
726            System.out.println("Map size after lossy compression: " + c1.size());
727            System.out.println("Entries removed: " + (ccTLDFromIPPrefixWorking.size() - c1.size()));
728    
729            // Now dump the new map...
730            try
731                {
732                final PrintWriter pw = new PrintWriter(new FileWriter(filename));
733                dumpPrefixMap(c1, pw);
734                pw.flush();
735                pw.close();
736                }
737            catch(final IOException e)
738                {
739                System.err.println("FAILED to write output file with IOException");
740                e.printStackTrace();
741                Runtime.getRuntime().exit(1);
742                return;
743                }
744            }
745        }