001 /*
002 Copyright (c) 1996-2011, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.clApp.offline;
031
032 import java.io.BufferedReader;
033 import java.io.EOFException;
034 import java.io.File;
035 import java.io.FileReader;
036 import java.io.FileWriter;
037 import java.io.IOException;
038 import java.io.PrintWriter;
039 import java.net.InetAddress;
040 import java.net.UnknownHostException;
041 import java.util.ArrayList;
042 import java.util.Arrays;
043 import java.util.Collections;
044 import java.util.HashMap;
045 import java.util.List;
046 import java.util.Map;
047 import java.util.Set;
048 import java.util.SortedMap;
049 import java.util.SortedSet;
050 import java.util.StringTokenizer;
051 import java.util.TreeMap;
052 import java.util.TreeSet;
053
054 import org.hd.d.pg2k.svrCore.AddrTools;
055 import org.hd.d.pg2k.svrCore.MemoryTools;
056 import org.hd.d.pg2k.svrCore.location.GeoProximity;
057 import org.hd.d.pg2k.svrCore.location.GeoUtils;
058
059 /**
060 * Created by IntelliJ IDEA.
061 * User: DHD
062 * Date: 01-May-2006
063 * Time: 12:02:55
064 */
065
066 /**Off-line utility to create properties file for IPv4 to ccTLD mapping.
067 */
068 public class MakeCCTLDFromIPPrefixProperties
069 {
/**Not instantiable: every member of this class is static. */
private MakeCCTLDFromIPPrefixProperties()
{
    // Deliberately empty.
}
072
073 /**Input data format name for MySQL-dump download from www.hostIP.info (circa 2006/04). */
074 public static final String INPUT_FORMAT_HOSTIP = "hostIPSQL";
075
076 /**All allowed input formats; immutable and never null. */
077 public static final SortedSet<String> INPUT_FORMATS =
078 Collections.unmodifiableSortedSet(new TreeSet<String>(Arrays.asList(new String[]{
079 INPUT_FORMAT_HOSTIP,
080 })));
081
082 /**Threshold (out of 256) at which we will discard minority values when doing lossy cc-prefix-table compression; strictly positive.
083 * If all the values in a given block share the same value except a small minority,
084 * and the count of minority items is less than or equal to this value
085 * then we pretend that the exception did not exist so we can trim down the tree at this point.
086 * <p>
087 * This is computed so that given the rough expected maximum cost of a worst-possible routing error
088 * (paying transit and getting ropey connectivity rather than a fast free local connection)
089 * we will not on average pay more than about twice the optimum routing costs
090 * because of our lossy encoding.
091 * This assumes that we'd never do much better than "country" level routing.
092 */
093 private static final int UNCONDITIONAL_PREFIX_LOSSY_COMPRESS_THRESHOLD = 1 +
094 256/GeoProximity.COUNTRY.getCloseness();
095
096 /**Threshold (out of 256) at which we will lump countries into a region when doing lossy cc-prefix-table compression; strictly positive.
097 * Provided that all but this number of entries in a given block
098 * are in the same region as the dominant ccTLD in that block
099 * then during lossy compression we can replace them with that dominant ccTLD.
100 * <p>
101 * The cost of this is assumed to be mainly
102 * routing cost for the wrong country within a region when claiming the dominant ccTLD.
103 * (We currently neglect the possible cost of completely-wrong routing out-of-region.)
104 */
105 private static final int REGION_PREFIX_LOSSY_COMPRESS_THRESHOLD = 1 +
106 256/GeoProximity.COUNTRYGROUP.getCloseness();
107
108 /**Threshold (out of 256) at which we will lump countries/regions into a continent when doing lossy cc-prefix-table compression; strictly positive.
109 * Provided that all but this number of entries in a given block
110 * are in the same region as the dominant ccTLD or region in that block
111 * then during lossy compression we can replace them with that dominant ccTLD/region.
112 * <p>
113 * The cost of this is assumed to be mainly
114 * routing cost for the wrong country within a continent when claiming the dominant ccTLD/region.
115 * (We currently neglect the possible cost of completely-wrong routing out-of-contient.)
116 */
117 private static final int CONTINENT_PREFIX_LOSSY_COMPRESS_THRESHOLD = 1 +
118 256/GeoProximity.CONTINENT.getCloseness();
119
120
121 /**Dump an IP-prefix map as-is to the given Writer.
122 * This does no "optimisation" or other transformation.
123 * <p>
124 * The output is printed in sorted order by the address-prefix key.
125 * <p>
126 * The output should be directly suitable to form all or part of a
127 * ccTLDFromIPPrefix.properties set.
128 */
129 public static final void dumpPrefixMap(final SortedMap<AddrTools.AddrPrefix,String> map, final PrintWriter pw)
130 {
131 for(final AddrTools.AddrPrefix ap : map.keySet())
132 { pw.println(ap.toPaddedDottedPrefix() + ' ' + map.get(ap)); }
133 }
134
135 /**Perform (lossy) compression on an IP-prefix-to-ccTLD map.
136 * This may use lossy compression of the loaded values
137 * where the saving is large and the implied increase in routing/performance cost is low.
138 * <p>
139 * We never delete "" entries to avoid semantic changes,
140 * nor top-level (1-octet) entries for clarity.
141 */
142 private static SortedMap<AddrTools.AddrPrefix,String> lossyCompressCcTLDFromIPPrefixMap(final SortedMap<AddrTools.AddrPrefix,String> mapIn)
143 {
144 // We take a working copy of the map.
145 final SortedMap<AddrTools.AddrPrefix,String> work = new TreeMap<AddrTools.AddrPrefix, String>(mapIn);
146
147 // We find the length of the longest prefix in the map.
148 int longestPrefix = 0;
149 for(final AddrTools.AddrPrefix prefix : work.keySet())
150 { if(prefix.length() > longestPrefix) { longestPrefix = prefix.length(); } }
151
152 // We work backwards in length,
153 // trying to trim away large numbers of leaves that carry little information...
154 // We never try to remove 1-octect entries.
155 for(int i = longestPrefix; i > 1; --i)
156 { _lossyTrimPrefixesAtLength(work, i); }
157
158 return(work);
159 }
160
/**Try to trim away leaves at the given prefix length.
 * Entries of exactly {@code lengthToTrim} octets are handled in blocks
 * that share the same parent (the prefix one octet shorter).
 * For each block all 256 possible final-octet values are looked up
 * and the block is collapsed into its parent when one of these holds:
 * <ul>
 * <li>almost every lookup is unknown (null): the block's entries are removed,
 *     along with the parent unless the parent is a 1-octet entry;</li>
 * <li>one value overwhelmingly dominates;</li>
 * <li>the dominant ccTLD is common enough and nearly everything else is "close" to it (region rule);</li>
 * <li>the dominant ccTLD/registry is common enough and nearly everything else is in its area (continent rule);</li>
 * <li>failing all of the above, and with no unknowns, the most common value is hoisted to the parent
 *     and only the entries carrying exactly that value are removed.</li>
 * </ul>
 * We never remove "" entries.
 * <p>
 * We won't remove entries at the stated length
 * that have longer (non-"") sub-entries.
 * This implies that trimming should be performed from the longest prefixes
 * down to the shortest.
 * <p>
 * The map is updated in place.
 * Progress is reported on System.out.
 *
 * @param map  the prefix map to trim in place; never null
 * @param lengthToTrim  the prefix length (in octets) of the entries to consider; strictly positive
 * @throws IllegalArgumentException  if map is null or lengthToTrim is less than 1
 */
private static void _lossyTrimPrefixesAtLength(final SortedMap<AddrTools.AddrPrefix,String> map,
                                               final int lengthToTrim)
{
    if((map == null) || (lengthToTrim < 1))
        { throw new IllegalArgumentException(); }

    System.out.println("_lossyTrimPrefixesAtLength(): STARTING trim at length: " + lengthToTrim);

    // First we snapshot all the keys into an array for easier manipulation
    // (and so that the map can be modified while we walk the keys).
    // We assume that this is always in sorted order.
    final AddrTools.AddrPrefix keys[] = map.keySet().toArray(new AddrTools.AddrPrefix[map.size()]);

    // We will search for entries of the stated length
    // working on blocks of those that differ only in their final byte
    // and refusing to trim if there are longer sub-entries.
    nextBlock: for(int i = 0; i < keys.length; ++i)
    {
        final AddrTools.AddrPrefix ap = keys[i];
        if(ap.length() != lengthToTrim) { continue; }

        // Make the prefix that is one octet shorter (the parent of this block).
        final AddrTools.AddrPrefix prefix = new AddrTools.AddrPrefix(ap, ap.length() - 1);

        // Note if we find longer sub-entries.
        boolean foundLongerSubEntries = false;

        // Find how big this block is, and possibly trim it.
        // Remember all non-"" entries, ie that we might zap.
        // NOTE(review): the scan starts at the first key of exactly lengthToTrim,
        // so any keys under the same parent that sort before it are not examined here -- confirm that is intended.
        final List<AddrTools.AddrPrefix> potentiallyZappable = new ArrayList<AddrTools.AddrPrefix>();
        final int blockStart = i;
        int blockEnd = blockStart;
        for(int j = blockStart; j < keys.length; ++j)
        {
            final AddrTools.AddrPrefix keyInBlock = keys[j];
            // Presumably true while keyInBlock lies strictly beneath the parent prefix -- verify against AddrPrefix.
            if(!keyInBlock.isStrictPrefix(prefix))
            {
                // Left the block of candidate entries.
                break;
            }

            // If not a "" entry, then it is potentially zappable.
            final boolean isEmptyString = "".equals(map.get(keyInBlock));
            if(!isEmptyString)
                { potentiallyZappable.add(keyInBlock); }

            // Still part of the block...
            blockEnd = j;

            // We will not be able to trim if we find longer sub-entries
            // other than "" entries which we always keep.
            if((keyInBlock.length() > lengthToTrim) && !isEmptyString)
                { foundLongerSubEntries = true; }
        }

        // Whether we trim or not,
        // the outer loop can now skip this block
        // (note that i will get incremented at the loop re-initialisation).
        i = blockEnd;

        // final int blockLength = blockEnd - blockStart + 1;

        //System.out.println("_lossyTrimPrefixesAtLength(): found candidate block to trim: " + prefix + ".*; blockLength=" + blockLength +
        //    ", potentiallyZappable.size()=" + potentiallyZappable.size() +
        //    (!foundLongerSubEntries ? "" : " (untrimmable because of longer non-\"\" sub-entries)"));

        // If we found longer sub-entries then we can't trim this block,
        // so move to the next one.
        if(foundLongerSubEntries)
            { continue nextBlock; /* Start on next block... */ }

        // Collect counts of different values at this level of the tree.
        // Note that we do this by trying to look up every possible value
        // within this block, even for those values not explicitly present,
        // to correctly account for propagation upwards of more general entries.
        final Map<String,Integer> valueCounts = new HashMap<String, Integer>();
        // Count of null values, ie where the lookup could not find anything.
        int nullCount = 0;

        // Do IPv4 address lookup,
        // so use a 4-byte address (leaving trailing octets at zero).
        final byte addr[] = new byte[4];
        // Copy in the correct prefix...
        final byte rawPrefix[] = prefix.toByteArray();
        for(int j = rawPrefix.length; --j >= 0; ) { addr[j] = rawPrefix[j]; }

        try
        {
            for(int b = 256; --b >= 0; )
            {
                // Adjust the variable byte for this block.
                // NOTE(review): assumes rawPrefix.length is at most 3 (ie lengthToTrim <= 4);
                // a longer prefix would index past the end of addr -- confirm keys are never longer than IPv4.
                addr[rawPrefix.length] = (byte) b;

                // Create an IPv4 InetAddress.
                final InetAddress synthInetAddress = InetAddress.getByAddress(addr);
                // The third argument is presumably the maximum prefix length to consider -- verify against GeoUtils.
                final String v = GeoUtils.lookupAddrInIPToCcTLDMap(synthInetAddress, map, ap.length());

                // Count all successful lookups.
                if(v != null)
                {
                    final Integer count = valueCounts.get(v);
                    if(count == null) { valueCounts.put(v, 1); }
                    else { valueCounts.put(v, 1 + count); }
                }
                else { ++nullCount; }
            }
        }
        catch(final UnknownHostException e) // Should never happen.
        {
            // NOTE(review): the Error thrown below does not carry e as its cause;
            // the stack trace printed here is the only record of it.
            e.printStackTrace();
            throw new Error("Internal error");
        }

        //System.out.println("_lossyTrimPrefixesAtLength(): block="+prefix+".*, nulls="+nullCount+", valueCounts=" + valueCounts);

        // OK, decide if we can compress this block.
        // If we *do*, then we remove all non-"" nodes from the block
        // and store at the prefix level the chosen representative value
        // (if that is null, it means that we remove the prefix node too).

        // Simplest case:
        // One overwhelming majority value, whatever it is,
        // if in a huge majority, is used to replace this whole block.
        final int overwhelmThreshold = 256 - UNCONDITIONAL_PREFIX_LOSSY_COMPRESS_THRESHOLD;
        assert(overwhelmThreshold > 128); // Must be a clear majority to avoid ambiguity...

        // If we know almost nothing about this block
        // then we simply remove the noise within it.
        if(nullCount >= overwhelmThreshold)
        {
            // Zap all the (non-"") nodes from this block and possibly the prefix node.
            System.out.println("_lossyTrimPrefixesAtLength(): removing block below "+prefix+" (and parent node if > 1 octet); potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
            if(prefix.length() > 1) { map.remove(prefix); }
            map.keySet().removeAll(potentiallyZappable);
            continue nextBlock; // Start on next block...
        }

        // // If we have too many unknowns in a block to be comfortable about
        // // making any specific routing judgement for the whole block
        // // then give up now.
        // if(nullCount > UNCONDITIONAL_PREFIX_LOSSY_COMPRESS_THRESHOLD)
        //     { continue nextBlock; /* Start on next block... */ }

        // Find the value with the highest count...
        // (Ties are resolved by whichever value the HashMap iteration yields first.)
        String highestCountValue = null;
        int highestCount = 0;
        for(final String v : valueCounts.keySet())
        {
            final int c = valueCounts.get(v);
            if(c > highestCount)
            {
                highestCount = c;
                highestCountValue = v;
            }
        }
        // At least one lookup must have succeeded, else the null-heavy case above would have fired.
        assert((highestCountValue != null) && (highestCount > 0));
        //System.out.println("_lossyTrimPrefixesAtLength(): dominant="+ highestCountValue + ", count="+highestCount);

        // If there is one overwhelmingly common entry in a block
        // then rewrite the block to that one value.
        if(highestCount >= overwhelmThreshold)
        {
            // Zap all the (non-"") nodes from this block and replace the prefix node.
            System.out.println("_lossyTrimPrefixesAtLength(): replacing block with new parent node: `" + prefix + ' ' + highestCountValue + "'; potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
            map.put(prefix, highestCountValue);
            map.keySet().removeAll(potentiallyZappable);
            continue nextBlock; // Start on next block...
        }

        final int regionThreshold = 256 - REGION_PREFIX_LOSSY_COMPRESS_THRESHOLD;

        // If there is one very common ccTLD in a block
        // and all but a very few other entries are countries in the same region,
        // then replace with the dominant ccTLD.
        if(GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) &&
           (highestCount >= regionThreshold))
        {
            // Check that enough other entries are countries in the same region.
            int outOfRegion = nullCount; // Include the nulls in the not-same-region list.
            final Set<GeoUtils.CCTLD> closeCcTLDs = GeoUtils.getCloseCCTLDs(new GeoUtils.CCTLD(highestCountValue));
            for(final String v : valueCounts.keySet())
            {
                if(v.equals(highestCountValue)) { continue; }

                if(!GeoUtils.CCTLD.isSyntaticallyValidCcTLD(v) || !closeCcTLDs.contains(new GeoUtils.CCTLD(v)))
                {
                    // Bad luck; this isn't a close country...
                    outOfRegion += valueCounts.get(v);
                    //System.out.println("_lossyTrimPrefixesAtLength(): outOfRegion="+outOfRegion+" after "+v +" dominant="+highestCountValue);
                }
            }
            if(outOfRegion <= REGION_PREFIX_LOSSY_COMPRESS_THRESHOLD)
            {
                // Zap all the (non-"") nodes from this block and replace the prefix node.
                System.out.println("_lossyTrimPrefixesAtLength(): replacing block with new parent node: `" + prefix + ' ' + highestCountValue + "'; potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
                map.put(prefix, highestCountValue);
                map.keySet().removeAll(potentiallyZappable);
                continue nextBlock; // Start on next block...
            }
        }


        final int continentThreshold = 256 - CONTINENT_PREFIX_LOSSY_COMPRESS_THRESHOLD;

        // If there is one very common ccTLD or region/registry/continent in a block
        // and all but a very few other entries are in the same area,
        // then replace with the dominant ccTLD/region.
        if((GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) ||
            GeoUtils.isSyntaticallyValidRegistryName(highestCountValue)) &&
           (highestCount >= continentThreshold))
        {
            // Check that enough other entries are countries in the same area.
            int outOfContinent = nullCount; // Include the nulls in the not-same-continent list.
            // Dominant ccTLD: use its close countries; dominant registry name: use the countries in that region.
            final Set<GeoUtils.CCTLD> closeCcTLDs = GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) ?
                GeoUtils.getCloseCCTLDs(new GeoUtils.CCTLD(highestCountValue)) :
                GeoUtils.getCountriesInRegion(highestCountValue);
            for(final String v : valueCounts.keySet())
            {
                if(v.equals(highestCountValue)) { continue; }

                // Make sure that this is a country close to the dominant country
                // (or in the dominant region),
                // OR that this is a region containing the dominant country,
                // else it is "out of area".
                if((!GeoUtils.CCTLD.isSyntaticallyValidCcTLD(v) || !closeCcTLDs.contains(new GeoUtils.CCTLD(v))) &&
                   (!GeoUtils.isSyntaticallyValidRegistryName(v) ||
                    !GeoUtils.CCTLD.isSyntaticallyValidCcTLD(highestCountValue) ||
                    !GeoUtils.getCountriesInRegion(v).contains(new GeoUtils.CCTLD(highestCountValue))))
                {
                    // Bad luck; this isn't a close country/region...
                    outOfContinent += valueCounts.get(v);
                    //System.out.println("_lossyTrimPrefixesAtLength(): outOfContinent="+outOfContinent+" after "+v +" dominant="+highestCountValue);
                }
            }
            if(outOfContinent <= CONTINENT_PREFIX_LOSSY_COMPRESS_THRESHOLD)
            {
                // Zap all the (non-"") nodes from this block and replace the prefix node.
                System.out.println("_lossyTrimPrefixesAtLength(): replacing block with new parent node: `" + prefix + ' ' + highestCountValue + "'; potentiallyZappable.size()=" + potentiallyZappable.size() + ", valueCounts="+valueCounts);
                map.put(prefix, highestCountValue);
                map.keySet().removeAll(potentiallyZappable);
                continue nextBlock; // Start on next block...
            }
        }


        // TODO: more subtle cases...


        // If no lossy compression scheme has applied
        // and if there are no nulls at this level
        // then attempt some simple lossless compression to save some space.
        // Set the node above to the most common value,
        // and remove all instances of that value from the block.
        // Only worth doing when we will remove at least two sub-nodes.
        // NOTE(review): the put() below overwrites any value already stored at the parent prefix;
        // if some slots in this block resolve only through that existing parent value
        // (and it is not the most common value) this step looks lossy for them -- confirm.
        if((nullCount == 0) && (highestCount > 1))
        {
            // Install the most common value at the prefix node, then zap only the nodes duplicating it.
            System.out.println("_lossyTrimPrefixesAtLength(): removing dups at this level having added new parent node: `" + prefix + ' ' + highestCountValue + "', valueCounts="+valueCounts);
            map.put(prefix, highestCountValue);
            for(final AddrTools.AddrPrefix pzap : potentiallyZappable)
            {
                if(highestCountValue.equals(map.get(pzap)))
                    { map.remove(pzap); }
            }
            continue nextBlock; // Start on next block...
        }
    }
}
447
448 /**Load external IP/location data into internal-style table; never null.
449 * This data can potentially be in one of several formats,
450 * each of which will require some massaging to get into our model.
451 * <p>
452 * Only input data records corresponding to ccTLDs explicitly listed in
453 * the "geo-proximity" data, ie whose getCloseCCTLDs() result is non-empty,
454 * will be retained.
455 * <p>
456 * We construct and return an unsorted map for speed.
457 *
458 * @param inputData the (readable) input data file; never null
459 * @param inputFormat the format (one of INPUT_FORMATS); never null
460 *
461 * @return input data in our format, for "interesting" ccTLDs
462 */
463 private static Map<AddrTools.AddrPrefix, String> loadData(final File inputData,
464 final String inputFormat)
465 throws IOException
466 {
467 final Map<AddrTools.AddrPrefix, String> result = new HashMap<AddrTools.AddrPrefix, String>();
468
469 if(INPUT_FORMAT_HOSTIP.equals(inputFormat))
470 {
471 // Parse MySQL dump format!
472
473 // Get a buffered handle on the country table data.
474 // This will be several DB rows as one or more lines
475 // near the start of the table...
476 final BufferedReader r1 = new BufferedReader(new FileReader(inputData));
477 // Prepare map of countries that we are interested in...
478 final Map<Integer, GeoUtils.CCTLD> countryNumberToCCTLD = new HashMap<Integer, GeoUtils.CCTLD>();
479 String countriesRecord;
480 try
481 {
482 while((countriesRecord = getNextRecordStarting(r1, "INSERT INTO countries ")) != null)
483 {
484 // Parse the countries data.
485 // Be forgiving of records that we can't parse easily,
486 // since there is some free text in country names.
487 //
488 // Expected row format (3 fields):
489 // * Internal country number (int)
490 // * Country name (String)
491 // * Upper-case CCTLD (String)
492 final String[][] countriesData =
493 parseINSERTRecord(countriesRecord, 3, true);
494
495 for(int i = 0; i < countriesData.length; ++i)
496 {
497 final String c = countriesData[i][2].toLowerCase();
498 if(!GeoUtils.CCTLD.isSyntaticallyValidCcTLD(c)) { continue; }
499 final GeoUtils.CCTLD ccTLD = new GeoUtils.CCTLD(c);
500 if(GeoUtils.getCloseCCTLDs(ccTLD).isEmpty()) { continue; }
501 // Interesting country, so file it...
502 System.out.println(" Keeping data on country: " + Arrays.asList(countriesData[i]));
503 countryNumberToCCTLD.put(Integer.valueOf(countriesData[i][0]),
504 MemoryTools.intern(new GeoUtils.CCTLD(MemoryTools.intern(c))));
505 }
506
507 }
508 }
509 finally { r1.close(); }
510
511 if(countryNumberToCCTLD.isEmpty())
512 { throw new EOFException("Could not find `countries` table/data."); }
513
514 // Now parse the main data records...
515 // Get a new buffered handle on the country table data.
516 final BufferedReader r2 = new BufferedReader(new FileReader(inputData));
517 try
518 {
519 // Get the IP records in whatever order they are available...
520 String ipRecord;
521 final String prefix = "INSERT INTO ip4_";
522 final int prefixLen = prefix.length();
523 // int rowNum = 0;
524 while((ipRecord = getNextRecordStarting(r2, prefix)) != null)
525 {
526 // Extract first byte of IP address (part of table name).
527 final int q = ipRecord.indexOf(' ', prefixLen);
528 final int byte1 = Integer.parseInt(ipRecord.substring(prefixLen, q), 10);
529 //System.out.println("Parsing entry "+(++rowNum)+" for addresses "+byte1+".X.X... (Entries so far: "+result.size()+"...)");
530
531 final byte addrPrefix[] = { (byte) byte1, 0, 0 };
532
533 // Now find and store all the entries
534 // for countries that we are interested in.
535 //
536 // Expected row format (5 fields):
537 // * Second byte of IPv4 address (int)
538 // * Third byte of IPv4 address (int)
539 // * Internal country number (int)
540 // * Internal city number (int)
541 // * Time/date of record.
542 final String[][] addrData =
543 parseINSERTRecord(ipRecord, 5, false);
544 for(int i = addrData.length; --i >= 0; )
545 {
546 final String[] row = addrData[i];
547 final GeoUtils.CCTLD cctld = countryNumberToCCTLD.get(Integer.valueOf(row[2]));
548 if(cctld == null) { continue; }
549
550 addrPrefix[1] = (byte) Integer.parseInt(row[0], 10);
551 addrPrefix[2] = (byte) Integer.parseInt(row[1], 10);
552 result.put(new AddrTools.AddrPrefix(addrPrefix),
553 cctld.code);
554 }
555 }
556
557 System.out.println("Finished reading data from: " + inputData);
558 }
559 finally { r2.close(); }
560 }
561 else
562 { throw new IllegalArgumentException("unrecognised input format"); }
563
564 return(result);
565 }
566
567 /**Parse a MySQL insert record from a data dump.
568 * This parses a MySQL record of the format:
569 * <pre>
570 INSERT INTO `<i>table name</i>` VALUES (<i>field1,field2,...</i>),<i>...</i>(<i>...</i>);
571 * </pre>
572 *
573 * @param record the full line; never null
574 * @param expectedFields the expected number of expectedFields per record; non-negative
575 * @param ignoreBadRecords silently skip bad records that we cannot parse
576 * (for example because they contain our separator characters)
577 *
578 * @return zero or more rows each of the specified number of expectedFields;
579 * no null records or fields
580 */
581 private static String[][] parseINSERTRecord(final String record,
582 final int expectedFields,
583 final boolean ignoreBadRecords)
584 {
585 if((record == null) || (expectedFields < 0))
586 { throw new IllegalArgumentException(); }
587
588 if(!record.startsWith("INSERT INTO ") || (!record.endsWith(";")))
589 { throw new IllegalArgumentException("malformatted line start/end"); }
590
591 final int valuesStart = record.indexOf(" VALUES ");
592 if(valuesStart == -1)
593 { throw new IllegalArgumentException("malformatted line VALUES"); }
594
595 final String core = record.substring(valuesStart + 8, record.length()-1).trim();
596 //System.out.println("core: " + core);
597
598 // Chop into table rows (when multiple rows are on one line)...
599 final StringTokenizer st = new StringTokenizer(core, ")(");
600 final List<String[]> allRows = new ArrayList<String[]>(1+(st.countTokens()/2));
601 for(int rowNum = 0; st.hasMoreTokens(); ++rowNum)
602 {
603 final String row = st.nextToken();
604 if(",".equals(row)) { continue; /* Row separator. */ }
605 final StringTokenizer stRow = new StringTokenizer(row, ",");
606 //System.out.println("ROW: `"+row+"'.");
607 final int nFields = stRow.countTokens();
608 if(nFields != expectedFields)
609 {
610 if(ignoreBadRecords) { continue; }
611 throw new IllegalArgumentException("Wrong number of fields (expected "+expectedFields+", got "+nFields+") in record "+rowNum+": " + row);
612 }
613 final String rowResult[] = new String[nFields];
614 for(int j = 0; j < nFields; ++j)
615 {
616 String s = stRow.nextToken();
617 // Strip any surrounding quotes...
618 if((s.length() > 2) && s.startsWith("'") && s.endsWith("'"))
619 { s = s.substring(1, s.length()-1); }
620 rowResult[j] = s;
621 }
622 //System.out.println("PARSED ROW: " + Arrays.asList(rowResult));
623 allRows.add(rowResult);
624 }
625
626 final String[][] result = new String[allRows.size()][];
627 allRows.toArray(result);
628 return(result);
629 }
630
631 /**Returns the next line/record starting with the specified String, else null at EOF. */
632 private static String getNextRecordStarting(final BufferedReader r,
633 final String lineStart)
634 throws IOException
635 {
636 String record;
637 while((record = r.readLine()) != null)
638 {
639 if(record.startsWith(lineStart))
640 { return(record); }
641 }
642 return(null);
643 }
644
645 /**An entry point to load the prefix map and write it out in a more compact format.
646 * This may use lossy compression of the loaded values
647 * where the saving is large
648 * and the implied increase in routing/performance cost is low.
649 * <p>
650 * The new map is dumped to the file named as the first argument.
651 * <p>
652 * If a source of new input data is supplied as an optional second argument,
653 * then it is merged into the existing data in memory,
654 * and the dump will be the lossily-compressed result.
655 * Only new input data corresponding to ccTLDs explicitly listed in
656 * the "geo-proximity" data, ie whose getCloseCCTLDs() result is non-empty,
657 * will be retained.
658 * This input data file may need to be re-read in multiple passes.
659 */
660 public static final void main(final String args[])
661 {
662 if((args.length != 1) && (args.length != 3))
663 {
664 System.err.println("Usage: main filenameToDumpTo [newDataFileName FORMAT]");
665 System.err.println(" Allowed values of FORMAT are: " + new ArrayList<String>(INPUT_FORMATS));
666 Runtime.getRuntime().exit(1);
667 return;
668 }
669
670 final String filename = args[0];
671
672 System.out.println("Will dump result to file: " + filename);
673
674 final SortedMap<AddrTools.AddrPrefix,String> ccTLDFromIPPrefix = GeoUtils.getCcTLDFromIPPrefix();
675 System.out.println("Loaded static map size: " + ccTLDFromIPPrefix.size());
676 // System.out.println("Loaded static map maximum key length: " + GeoUtils.ccTLDFromIPPrefixLongestKey);
677
678 // The current static data...
679 SortedMap<AddrTools.AddrPrefix,String> ccTLDFromIPPrefixWorking = ccTLDFromIPPrefix;
680
681 // If new data has been supplied,
682 // then load it now,
683 // overriding extant static data in case of conflict.
684 if(args.length == 3)
685 {
686 final File inputData = new File(args[1]);
687 final String inputFormat = args[2];
688
689 if(!INPUT_FORMATS.contains(inputFormat))
690 {
691 System.err.println("I do not understand input format: " + inputFormat);
692 System.err.println("Valid formats are: " + new ArrayList<String>(INPUT_FORMATS));
693 System.exit(1);
694 }
695
696 if(!inputData.exists() || !inputData.canRead() || !inputData.isFile())
697 {
698 System.err.println("Cannot open input file: " + inputData);
699 System.exit(1);
700 }
701
702 try
703 {
704 // Copy existing static data...
705 ccTLDFromIPPrefixWorking = new TreeMap<AddrTools.AddrPrefix, String>(ccTLDFromIPPrefix);
706 // Override with new data...
707 ccTLDFromIPPrefixWorking.putAll(loadData(inputData, inputFormat));
708 }
709 catch(final IOException e)
710 {
711 e.printStackTrace();
712 System.err.println("Problem reading input data file: " + inputData + ": " + e.getMessage());
713 System.exit(1);
714 }
715 catch(final Exception e)
716 {
717 e.printStackTrace();
718 System.err.println("Problem with input data file: " + inputData + ": " + e.getMessage());
719 System.exit(1);
720 }
721 }
722
723 // Now transform/compress the map.
724 System.out.println("Starting lossy compression on map of size: " + ccTLDFromIPPrefixWorking.size());
725 final SortedMap<AddrTools.AddrPrefix,String> c1 = lossyCompressCcTLDFromIPPrefixMap(ccTLDFromIPPrefixWorking);
726 System.out.println("Map size after lossy compression: " + c1.size());
727 System.out.println("Entries removed: " + (ccTLDFromIPPrefixWorking.size() - c1.size()));
728
729 // Now dump the new map...
730 try
731 {
732 final PrintWriter pw = new PrintWriter(new FileWriter(filename));
733 dumpPrefixMap(c1, pw);
734 pw.flush();
735 pw.close();
736 }
737 catch(final IOException e)
738 {
739 System.err.println("FAILED to write output file with IOException");
740 e.printStackTrace();
741 Runtime.getRuntime().exit(1);
742 return;
743 }
744 }
745 }