001 /*
002 Copyright (c) 1996-2011, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.ai.scorer;
031
032 import java.io.IOException;
033 import java.util.ArrayList;
034 import java.util.Collection;
035 import java.util.Collections;
036 import java.util.Comparator;
037 import java.util.HashMap;
038 import java.util.HashSet;
039 import java.util.Iterator;
040 import java.util.List;
041 import java.util.Map;
042 import java.util.NavigableSet;
043 import java.util.Set;
044 import java.util.SortedSet;
045 import java.util.TreeSet;
046 import java.util.concurrent.ConcurrentHashMap;
047 import java.util.concurrent.ConcurrentMap;
048 import java.util.concurrent.atomic.AtomicInteger;
049
050 import org.hd.d.pg2k.svrCore.GenUtils;
051 import org.hd.d.pg2k.svrCore.MemoryTools.SimpleLRUMapAutoSizeForHitRate;
052 import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
053 import org.hd.d.pg2k.svrCore.Tuple.Pair;
054 import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
055 import org.hd.d.pg2k.svrCore.vars.EventPeriod;
056 import org.hd.d.pg2k.svrCore.vars.SimpleVariableValue;
057 import org.hd.d.pg2k.svrCore.vars.SystemVariables;
058
059 import ORG.hd.d.IsDebug;
060
061
062 /**Thread-safe container for Scorer-and-parameter information to maintain an evolving population.
063 * This separates Scorers by basic (parameterless) name,
064 * on the grounds that they are not directly comparable.
065 * <p>
066 * This has limited capacity (so as to cap resource use),
067 * and discards the least good parameterised Scorers as needed
068 * to make way for new (better) instances.
069 * (These discarded less-good Scorers may have their results retained
070 * in a smalled secondary "negative" cache to speed up re-testing.)
071 * <p>
072 * This has the automatially persists/announces/shares new/good instances
073 * using the system variables events mechanism, local and global.
074 * <p>
075 * TODO: in case of accute memory shortage abandon least-good Scorers
076 */
077 public final class ScorerPopulation
078 {
079 /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
080 * Note that we may slightly exceed the maximum specified population size for expediency,
081 * and we may dump some (less good) part of the population under memory stress.
082 *
083 * @param baseScorers non-null, non-empty, pure-base-name seed set
084 * @param maxPopulationSize maximum number of Scorer details to retain in memory,
085 * possibly excluding the fixed/base Scorers, strictly positive;
086 * this value is advisory and may be slightly exceeded for performance
087 * @param callback used to report new best-of-breed being inserted into cache;
088 * may be null
089 */
090 public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
091 final NewBestCallbackIF callback,
092 final int maxPopulationSize)
093 {
094 this.callback = callback;
095
096 if(baseScorers == null) { throw new IllegalArgumentException(); }
097 // Store immutable (defensive) copy of base Scorer names/instances.
098 this.baseScorers = Collections.unmodifiableMap(new HashMap<String, ScorerIF>(baseScorers));
099 if(this.baseScorers.isEmpty()) { throw new IllegalArgumentException(); }
100 // Validate the Scorer names and instances.
101 for(final String n : this.baseScorers.keySet())
102 {
103 // Validate the name, and make sure that there are no parameters.
104 if(!AbstractScorer.parseNameAndParameters(n).second.isEmpty())
105 { throw new IllegalArgumentException("no parameters allowed on seed set"); }
106 // Ensure that the name corresponds to the actual Scorer instance supplied.
107 final ScorerIF scorer = baseScorers.get(n);
108 if((scorer == null) || !n.equals(scorer.getBaseName()))
109 { throw new IllegalArgumentException("mismatch between Scorer base name and instance"); }
110 if(!n.equals(scorer.getNameAndParameters()))
111 { throw new IllegalArgumentException("supplied Scorer instance was parameterised"); }
112 }
113 // Create the Set of valid classes for quick validation of instances.
114 supportedScorerClasses = new HashSet<Class<? extends ScorerIF>>(2 * this.baseScorers.size());
115 for(final ScorerIF sc : this.baseScorers.values())
116 { supportedScorerClasses.add(sc.getClass()); }
117
118 // Must be at least two slots per base Scorer type/name
119 // (to allow for the base/parameterless value and one 'optimal' value).
120 if(maxPopulationSize <= 2*this.baseScorers.size()) { throw new IllegalArgumentException("population too small"); }
121 this.maxPopulationSize = maxPopulationSize;
122
123 // Size to accommodate the specified capacity without further expansion.
124 // We don't expect much write concurrency.
125 // We use a lowish load-factor to enhance read and write performance.
126 _scorerScores = new ConcurrentHashMap<String, Pair<Long, ScoreAndConf>>(2*maxPopulationSize, 0.5f, 4);
127
128 // The comparator arranges for the items to be sorted by increasing "goodness".
129 // Because of a lack of symmetry in scoring
130 // (ie reversing the prediction of a bad Scorer does not necessarily make for a good prediction)
131 // we are most interested in the highest-confidence, positive-correlation Scorers.
132 SCORER_RANK_COMPARATOR = new Comparator<String>()
133 {
134 // This treats missing items from _scorerScores as neutral (goodness 0).
135 public final int compare(final String a, final String b)
136 {
137 final ScoreAndConf sacA = _scorerScores.get(a).second;
138 final ScoreAndConf sacB = _scorerScores.get(b).second;
139 // Primary sort by goodness.
140 final int goodnessA = (sacA == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacA);
141 final int goodnessB = (sacB == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacB);
142 if(goodnessA < goodnessB) { return(-1); }
143 if(goodnessA > goodnessB) { return(+1); }
144 if((sacA != null) && (sacB != null))
145 {
146 // Secondary sort by score (if both scores available).
147 if(sacA.score < sacB.score) { return(-1); }
148 if(sacA.score > sacB.score) { return(+1); }
149 }
150 // Break ties by name to ensure a total ordering.
151 return(a.compareTo(b));
152 }
153 };
154 // Create the (immutable) outer map with empty sub-maps for _scorerByGoodness.
155 final Map<String, NavigableSet<String>> sbg = new HashMap<String, NavigableSet<String>>(2 * baseScorers.size());
156 for(final String n : this.baseScorers.keySet())
157 { sbg.put(n, new TreeSet<String>(SCORER_RANK_COMPARATOR)); }
158 _scorerByGoodness = Collections.unmodifiableMap(sbg);
159
160 // Backup LRU cache, mainly for "bad" Scorer results.
161 // This also accommodates 'clones' evicted from the main population.
162 // This can be cleared entirely during extreme memory stress.
163 final int _backstopLRUMaxSize = 1 + maxPopulationSize/2;
164 _backstopLRU = SimpleLRUMapAutoSizeForHitRate.<String, Pair<Long, ScoreAndConf>>create(0, _backstopLRUMaxSize, "_backstopLRU");
165
166 // Create Scorer lookup cache, size limited, in proportion to max population size.
167 // This allows us to hold in cache every live Scorer in the main population.
168 // This assumes that each Scorer instance does not consume much memory/resource.
169 // Since we may well need to churn through the entire population sequentially,
170 // eg during scavenging or possibly while selecting a breeding set,
171 // this cache should be significantly larger than the population size to be of most value.
172 // This can be cleared entirely during extreme memory stress.
173 _gSI_cache = SimpleLRUMapAutoSizeForHitRate.<String, ScorerIF>create(0, 1 + ((3*maxPopulationSize)/2), "_gSI_cache");
174 }
175
176 /**Comparator used to rank Scorers relative to one another by increasing goodness, never null.
177 * Scorer values not currently represented in the population
178 * are treated as if goodness 0 (ie neutral).
179 * <p>
180 * If both goodness values are available then a secondary ranking is done
181 * by increasing score.
182 * <p>
183 * Complete ties are broken by Scorer name.
184 */
185 public final Comparator<String> SCORER_RANK_COMPARATOR;
186
187 /**If true then we may kick out apparent-near duplicates as the population reaches maximum.
188 * The aim of doing this is to increase diversity of the 'gene pool' for a given capacity,
189 * by making space for significantly different genotypes
190 * and avoiding them being swamped or pushed out by many near-clones of the overall best Scorer
191 * in any given sub-population.
192 * <p>
193 * This scavenging may have a random element.
194 * <p>
195 * Scavenging may be quite slow.
196 */
197 private static final boolean SCAVENGE_FOR_DIVERSITY = true;
198
199 /**Reset/clear the population to its initial (empty) state.
200 * Does not shut out all other activity,
201 * so it is possible that new items may be inserted while this is running.
202 */
203 public void clear()
204 {
205 // First clear the back-stop cache.
206 _backstopLRU.clear();
207 // Now systematically clear the main data stores and caches under the data lock.
208 synchronized(_lock)
209 {
210 _clearCaches();
211 _scorerScores.clear();
212 for(final NavigableSet<String> nm : _scorerByGoodness.values())
213 { nm.clear(); }
214 }
215 // Finally clear out the Scorer instance cache.
216 _gSI_cache.clear();
217 }
218
219 /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
220 * Note that we may slightly exceed the maximum specified population size for expediency,
221 * and we may dump some (less good) part of the population under memory stress.
222 * <p>
223 * This automatically posts new best-of-breed Scorers to the system variables event store
224 * as they are cached.
225 *
226 * @param baseScorers non-null, non-empty, pure-base-name seed set
227 * @param maxPopulationSize maximum number of Scorer details to retain in memory,
228 * possibly excluding the fixed/base Scorers, strictly positive;
229 * this value is advisory and may be slightly exceeded for performance
230 * @param vars system variables pipeline to record new/best Scorer events to
231 */
232 public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
233 final int maxPopulationSize,
234 final SimpleExhibitPipelineIF vars)
235 {
236 this(baseScorers,
237 (new NewBestCallbackIF() {
238 public final void reportNewBestScorer(final String nameAndParameters)
239 {
240 final SimpleVariableValue svvL = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_LOCAL_EVENT,
241 nameAndParameters);
242 try { vars.setVariable(svvL); }
243 catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
244 try
245 {
246 if(USE_GLOBAL_EVENTS_ALSO &&
247 "true".equalsIgnoreCase(vars.getGenProps(-1).getGen().get(GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX)))
248 {
249 final SimpleVariableValue svvG = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_GLOBAL_EVENT,
250 nameAndParameters);
251 try { vars.setVariable(svvG); }
252 catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
253 }
254 }
255 catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
256 }
257 }),
258 maxPopulationSize);
259 }
260
261 /**Maximum number of Scorer details to retain, possibly excluding the fixed/base Scorers, strictly positive.
262 * Must in fact be large than the number of base Scorer types,
263 * since we will need at least once slot for each non-parameterised type.
264 */
265 private final int maxPopulationSize;
266
267 /**Immutable Map from base Scorer names (no parameters) to base/parameterless instances to monitor/maintain; non-null, non-empty. */
268 private final Map<String, ScorerIF> baseScorers;
269
270 /**Set of class values for the supported Scorer types for fast validation; never null. */
271 private final Set<Class<? extends ScorerIF>> supportedScorerClasses;
272
273 /**Callback to report new best-of-breed being inserted into cache; may be null. */
274 private final NewBestCallbackIF callback;
275
276 /**A back-stop LRU cache of Scorer goodness, so that in particular the results of poor Scorers need not be excessively recomputed; never null.
277 * Normally we only keep the ScoreAndConf of the best Scorers,
278 * but if we happen to be repeatedly asked for the results of poor Scorers
279 * and we don't hold those results because they are not good to be in the main population
280 * then we may simply waste disproportionate time recomputing them.
281 * <p>
282 * This aims to be a bonus back-stop cache of all results not in the main cache,
283 * and is not strictly part of the counted population
284 * because these are not part of the potential "breeding set":
285 * they are here only for "negative cacheing" of poor Scorer parameters sets.
286 * <p>
287 * We keep it much smaller than the main population
288 * so that it is not a significant resource drain.
289 * <p>
290 * We push Scorers into it when they are displaced from the main population.
291 * <p>
292 * Thread-safe.
293 * <p>
294 * It is permissible and viable to hold a lock on this object over multiple operations
295 * to make them into one compound atomic operation.
296 */
297 private final SimpleLRUMapAutoSizeForHitRate<String, Pair<Long, ScoreAndConf>> _backstopLRU;
298
299 /**Non-user-visible lock to serialise activity on _scorerScores and _scorerByGoodness; never null.
300 * No blocking/external calls are made in the scope of this lock
301 * so as to ensure that it is held only for very short intervals.
302 */
303 private final Object _lock = new Object();
304
305 /**Thread-safe cache of Scorer scores (and time when cached), indirectly size-limited; never null.
306 * Map from scorer name-and-parameters to ScoreAndConfidence and the time computed/cached.
307 * <p>
308 * Single get() accesses to this can be made safely without further locking
309 * so as to maximise concurrency.
310 * <p>
311 * Note that since a put() operation does not lock out get() operations,
312 * any updates on the main _scorerScores map must be incrementally valid,
313 * ie incorrect data must never be inserted into the _scorerScores map.
314 */
315 private final ConcurrentMap<String, Pair<Long, ScoreAndConf>> _scorerScores;
316
317 /**Map from base Scorer name to a least-good-first NavigableSet of full nameAndParameters; never null.
318 * We pre-populate this with empty maps under each base name that we handle
319 * (so it is never necessary to test for presence in, or ever alter, this outer map; hence it is immutable).
320 * <p>
321 * The top-level Comparator is selected to sort by goodness derived from the _scorerScores,
322 * so any extant entry in the nameAndParameters should be removed from the _scorerByGoodness map
323 * before removal from or replacement in the _scorerScores map,
324 * and re-inserted afterwards as necessary.
325 * <p>
326 * Since the different base Scorer names/types are held separately,
327 * this helps keep the probably-distinct populations from competing with one another,
328 * though the overall population size cap is still present.
329 * <p>
330 * Because of a potential lack of symmetry in scoring
331 * (ie reversing the prediction of a bad Scorer does not necessarily make it a good one)
332 * we are interested in preserving the highest-confidence, positive-correlation Scorers.
333 * <p>
334 * Not thread-safe; access only under _lock.
335 */
336 private final Map<String, NavigableSet<String>> _scorerByGoodness;
337
338 /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
339 * Never recomputes, always fetches from its cache if present, else returns null.
340 * <p>
341 * Older results (typically up to 1 day older) may be returned if allowStale is true;
342 * this may help the system overall ride out temporary I/O problems and disconnections.
343 * <p>
344 * These get operations are generally fast,
345 * though some cycle-stealing incremental housekeeping might be performed on this call.
346 *
347 * @param scorer the Scorer instance; never null
348 * @param allowStale if true then allow an older cache response
349 * (up to VLONG period older)
350 *
351 * @return the score represents the correlation with the underlying votes
352 * (and whatever the scoring is measured against)
353 * with MAX meaning perfect correlation, 0 meaning no correlation,
354 * and -MAX meaning perfectly wrong answers all the time,
355 * and the confidence 0 if we have no (or very few) data points yet
356 * and approaching MAX as we have a large (enough) number of data points
357 *
358 * @throws IllegalArgumentException if the Scorer is null
359 */
360 public ScoreAndConf getScorerWeighting(final ScorerIF scorer,
361 final boolean allowStale)
362 {
363 if(scorer == null) { throw new IllegalArgumentException(); }
364 return(getScorerWeighting(scorer.getNameAndParameters(), allowStale));
365 }
366
367 /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
368 * Never recomputes, always fetches from its cache if present, else returns null.
369 * <p>
370 * Older results (typically up to 1 day older) may be returned if allowStale is true;
371 * this may help the system overall ride out temporary I/O problems and disconnections.
372 * <p>
373 * These get operations are generally fast,
374 * though some cycle-stealing incremental housekeeping might be performed on this call.
375 *
376 * @param scorerNameAndParameters the name and parameters of the Scorer; never null
377 * @param allowStale if true then allow an older cache response
378 * (up to VLONG period older)
379 *
380 * @return the score represents the correlation with the underlying votes
381 * (and whatever the scoring is measured against)
382 * with MAX meaning perfect correlation, 0 meaning no correlation,
383 * and -MAX meaning perfectly wrong answers all the time,
384 * and the confidence 0 if we have no (or very few) data points yet
385 * and approaching MAX as we have a large (enough) number of data points
386 */
387 public ScoreAndConf getScorerWeighting(final String scorerNameAndParameters,
388 final boolean allowStale)
389 {
390 if(scorerNameAndParameters == null) { throw new IllegalArgumentException(); }
391
392 final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(System.currentTimeMillis());
393
394 // We don't need to validate the requested name (other than as being non-null)
395 // since an invalid value will correctly return null.
396 // We don't need to take any locks for this single get();
397 // this helps maximise concurrency and can proceed
398 // even when a long/slow put() operation is taking place in another thread.
399 final Pair<Long, ScoreAndConf> cachedValue = _scorerScores.get(scorerNameAndParameters);
400 // We can use a value from cache if the cached value is from the current period
401 // (or the previous period if allowStale is true).
402 if((cachedValue != null) &&
403 (EventPeriod.VLONG.getIntervalNumber(cachedValue.first) >= (allowStale ? currentPeriod-1 : currentPeriod)))
404 { return(cachedValue.second); }
405
406 // Try back-stop LRU cache.
407 // We access this out of the scope of the main lock to maximise concurrency.
408 // Note that we allow use of older values than from the main population.
409 // We actively remove expired items to make more room in this back-stop cache.
410 synchronized(_backstopLRU)
411 {
412 final Pair<Long, ScoreAndConf> backstopResult = _backstopLRU.get(scorerNameAndParameters);
413 if(backstopResult != null)
414 {
415 final long oldestRetained = currentPeriod-2;
416 final long cachedItemAge = EventPeriod.VLONG.getIntervalNumber(backstopResult.first);
417 if(cachedItemAge < oldestRetained)
418 {
419 // Remove expired value to make room for others.
420 _backstopLRU.remove(scorerNameAndParameters);
421 }
422 else if(cachedItemAge >= (allowStale ? oldestRetained : currentPeriod))
423 {
424 //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation.getScorerWeighting(): caught lookup with back-stop LRU cache.]"); }
425 return(backstopResult.second);
426 }
427 }
428 }
429
430 // Nothing in either cache...
431 return(null);
432 }
433
434 /**GenProps 'generic' parameter that must be set for us to post to the global event store. */
435 public static final String GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX = "ai.scorerGlobalPostEnable";
436
437 /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
438 * Saves the given Scorer and "goodness",
439 * if necessary evicting a poorly-performing Scorer to make room
440 * ie to limit the population size.
441 * <p>
442 * If the "put" Scorer is very good compared to the existing population members
443 * then it may get persisted/shared via the event mechanism.
444 * <p>
445 * This routine will only consider storing entries for Scorers whose base name
446 * is on the list we were constructed with.
447 * <p>
448 * Note that a put() operation does not lock out get() operations,
449 * so any updates on the main _scorerScores map must be incrementally valid,
450 * ie incorrect data must never be inserted into the _scorerScores map.
451 * <p>
452 * This will attempt to cache the actual Scorer instance provided
453 * which may save time reconstructing it later, and memory by better sharing.
454 * This instance may be recovered with getScorerInstance() later,
455 * though may have to be recreated (ie returning the same instance is never guaranteed).
456 *
457 * @param scorer the Scorer instance; never null
458 * @param sac rating of this Scorer across all exhibits; never null
459 * @param dataSource to record/persist/share good Scorers through;
460 * null if no Scorer persistence is required
461 * @param log logger; never null
462 *
463 * @return true if the Scorer result is new for its sub-population and best in it
464 * and has a positive overall "goodness" (ie has some positive predictive ability)
465 *
466 * @throws IllegalArgumentException if the Scorer is null or of an unsupported type
467 */
468 public boolean putScorerWeighting(final ScorerIF scorer,
469 final ScoreAndConf sac,
470 final SimpleExhibitPipelineIF dataSource,
471 final SimpleLoggerIF log)
472 {
473 if(scorer == null) { throw new IllegalArgumentException(); }
474 if(!supportedScorerClasses.contains(scorer.getClass())) { throw new IllegalArgumentException("unsupported Scorer class"); }
475 final String nameAndParameters = scorer.getNameAndParameters();
476 _gSI_cache.put(nameAndParameters, scorer);
477 return(putScorerWeighting(nameAndParameters, sac, dataSource, log));
478 }
479
480 /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
481 * Saves the given Scorer and "goodness",
482 * if necessary evicting a poorly-performing Scorer to make room
483 * ie to limit the population size.
484 * <p>
485 * If the "put" Scorer is very good compared to the existing population members
486 * then it may get persisted/shared via the event mechanism.
487 * <p>
488 * This routine will only consider storing entries for Scorers whose base name
489 * is on the list we were constructed with.
490 * <p>
491 * Note that a put() operation does not lock out get() operations,
492 * so any updates on the main _scorerScores map must be incrementally valid,
493 * ie incorrect data must never be inserted into the _scorerScores map.
494 *
495 * @param nameAndParameters the name and parameters of the scorer; never null
496 * @param sac rating of this Scorer across all exhibits; never null
497 * @param dataSource to record/persist/share good Scorers through;
498 * null if no Scorer persistence is required
499 * @param log logger; never null
500 *
501 * @return true if the Scorer result is new for its sub-population and best in it
502 * and has a positive overall "goodness" (ie has some positive predictive ability)
503 */
504 public boolean putScorerWeighting(final String nameAndParameters,
505 final ScoreAndConf sac,
506 final SimpleExhibitPipelineIF dataSource,
507 final SimpleLoggerIF log)
508 {
509 if(sac == null) { throw new IllegalArgumentException(); }
510 if(nameAndParameters == null) { throw new IllegalArgumentException(); }
511
512 final Pair<String, Map<String, String>> nap = AbstractScorer.parseNameAndParameters(nameAndParameters);
513 final String baseName = nap.first;
514 // Ignore if not one of our recognised base Scorers.
515 if(!baseScorers.keySet().contains(baseName)) { return(false); }
516 final int goodness = ScoreAndConf.computeScorerGoodness(sac);
517 final boolean parameterless = nap.second.isEmpty();
518
519 final long startTime = System.currentTimeMillis();
520
521 // Set true if we discover that this is the best Scorer of its type so far
522 // so we should save/share it.
523 boolean isNewAndBest = false;
524
525 // Still a candidate for storing, so grab a lock...
526 synchronized(_lock)
527 {
528 // Clear caches within the lock scope to avoid/minimise races.
529 _clearCaches();
530
531 final SortedSet<String> insertedScorerSubPopulation = _scorerByGoodness.get(baseName);
532 // Compute previous 'best' score in this sub-population if any.
533 final int prevBest = insertedScorerSubPopulation.isEmpty() ? Integer.MIN_VALUE :
534 ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second);
535
536 // Remove and return any extant value for this Scorer value, null if none previously.
537 final Pair<Long, ScoreAndConf> oldVal = _remove(nameAndParameters);
538
539 // Insert the value: first in _scorerScores, then in the dependent _scorerByGoodness.
540 // We DO NOT intern() the possibly-long name because doing so would stress the PermGen.
541 _scorerScores.put(nameAndParameters, new Pair<Long, ScoreAndConf>(startTime, sac));
542 insertedScorerSubPopulation.add(nameAndParameters);
543
544 // Test if this is the best Scorer in its sub-population
545 // and has a non-zero goodness.
546 // Note that this has already been inserted into that sub-population,
547 // so cannot be "better",
548 // so we test against the previous 'best' value if any.
549 assert(goodness <= ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second));
550 if((goodness > 0) && (goodness > prevBest) && (oldVal == null))
551 { isNewAndBest = true; }
552
553 // If the population is oversize following this insertion
554 // then we'll cull the worst Scorer from the largest sub-population if possible.
555 if(_scorerScores.size() > maxPopulationSize)
556 {
557 // Find the largest sub-population.
558 int largestSubPopulation = -1;
559 String largestSubPopulationBasename = null;
560 for(final String subPopBaseName : _scorerByGoodness.keySet())
561 {
562 final int subPopSize = _scorerByGoodness.get(subPopBaseName).size();
563 if(subPopSize > largestSubPopulation)
564 {
565 largestSubPopulation = subPopSize;
566 largestSubPopulationBasename = subPopBaseName;
567 }
568 }
569 assert(largestSubPopulation > 0);
570 assert(largestSubPopulationBasename != null);
571
572 // Don't attempt scavenging all the time, since it may be very expensive, ie O(n).
573 // We try to reduce the average time scavenging to about O(1) on average.
574 // Scavenging is likely to work more effectively per-call if called infrequently.
575 if(SCAVENGE_FOR_DIVERSITY)
576 {
577 // Attempt one scavenging (clone-stripping) run on this (largest) sub-population.
578 _scavenge(largestSubPopulationBasename);
579 }
580
581 // If scavenging failed to recover any space then evict the least-good Scorer.
582 if(_scorerScores.size() > maxPopulationSize)
583 {
584 // ZAP LEAST-GOOD SCORER SO AS TO CAP THE POPULATION SIZE IF (STILL) OVERSIZE...
585 // Iterate through the largest population
586 // culling the first (least good) parameterised Scorer(s) found, if any.
587 // We may not be able to remove anything, leaving the population slightly oversized.
588 final SortedSet<String> victimSubPop = _scorerByGoodness.get(largestSubPopulationBasename);
589 for(final String putativeVictim : victimSubPop)
590 {
591 // We never remove the parameterless base Scorer from its sub-population.
592 if(putativeVictim.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
593 // We push this about-to-be-zapped victim into the back-stop LRU cache...
594 _backstopLRU.put(putativeVictim, _scorerScores.get(putativeVictim));
595 // Now remove the victim from the main data stores.
596 final Pair<Long, ScoreAndConf> zapped = _remove(putativeVictim);
597 assert(zapped != null); // Must have actually removed the specified 'poor' Scorer.
598 // Having trimmed the population (and altered the collections) we must now stop.
599 break;
600 }
601 }
602 }
603 }
604
605 // If this a new/best +ve-goodness Scorer (in its sub-population) then post it locally
606 // if not or parameterless, and if we have a live dataSource.
607 // This is done outside of the lock scope
608 // to avoid blocking other get/put operations during I/O.
609 // We absorb (and report) any I/O error, but continue.
610 if(isNewAndBest && !parameterless && (callback != null))
611 {
612 if(IsDebug.isDebug) { log.log("[ScorerPopulation: new best Scorer: "+nameAndParameters+" "+sac+", 'goodness'="+goodness+"...]"); }
613 callback.reportNewBestScorer(nameAndParameters);
614 }
615
616 return(isNewAndBest);
617 }
618
619 /**Remove and return any extant value for this Scorer value, null if none previously.
620 * Must only be called under the data lock to be atomic wrt the data stores.
621 *
622 * @param nameAndParameters non-null valid Scorer value incluing optional parameters
623 *
624 * @return previous timestamp/value for this Scorer, else null if nne
625 */
626 private synchronized Pair<Long, ScoreAndConf> _remove(final String nameAndParameters)
627 {
628 assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
629
630 // See if we have an existing entry for this nameAndParameters.
631 // (If not, there is nothing to remove.)
632 final Pair<Long, ScoreAndConf> oldVal = _scorerScores.get(nameAndParameters);
633 if(oldVal == null) { return(null); }
634 // If we do, then remove the corresponding entry from _scorerByGoodness first
635 // else we will break its comparator/lookup which depends on _scorerScores.
636 // Quickly extract the base name of the Scorer to select the sub-population.
637 final int firstSep = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
638 final String baseName = (firstSep == -1) ? nameAndParameters : (nameAndParameters.substring(0, firstSep));
639 // Remove from the appropriate sub-population first...
640 final SortedSet<String> subPopulation = _scorerByGoodness.get(baseName);
641 assert (subPopulation != null);
642 assert (subPopulation.contains(nameAndParameters));
643 subPopulation.remove(nameAndParameters);
644 assert (!subPopulation.contains(nameAndParameters));
645 // Then remove from the main mapping, ie _scorerScores.
646 _scorerScores.remove(nameAndParameters);
647 return(oldVal);
648 }
649
650 /**Countdown to next scavenge: when zero then scavenge and add on current population size. */
651 private final AtomicInteger countdownToNextScavenge = new AtomicInteger();
652
653 /**Minimum 'delta' in score for two Scorers to be considered potentially similar; non-negative.
654 * This is the minimum difference in the raw score, not weighted by confidence,
655 * for two Scorers on similar overall goodness to be considered 'similar',
656 * given that we will then further filter by genotype and/or phenotype,
657 * the assumption being that differences of this scale may just be (rounding and other) noise.
658 * <p>
659 * A good value for this is probably less than 1% of maximum,
660 * even though many Scorers will struggle to be much better than chance than this.
661 * <p>
662 * Note that we may apply a tighter test to values very close to zero.
663 * <p>
664 * A value of zero indicates that scores have to be identical
665 * to be considered similar by this measure.
666 */
667 private static final int SIMILAR_SCORE_DELTA = Math.max(2, ScoreAndConf.MAX_GOODNESS >>> 7);
668
669 /**If true then if a minor scavenge fails always force a major scavenge.
670 * This may be expensive but more-or-less guarantees
671 * that there will always be some space in the population
672 * for marginal but unusual Scorers to have a chance to be bred from.
673 */
674 private static final boolean FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS = true;
675
676 /**Attempt to strip out near-clones from the population to make space for more diversity.
677 * This works on the assumption that there may be many valuable Scorer values
678 * that while not optimal across all exhibits or at this very moment,
679 * may be valuable on sub-sets of the exhibits now or in the near future.
680 * This routine attempts to prevent limited population space (gene-pool capacity)
681 * from being overrun with very slight variations on (effective clones of) a monoculture.
682 * <p>
683 * Currently this is done by elimination of genotypic near-clones
684 * by eliminating those Scorers that have very similar Scores and no sufficiently-distinct genes.
685 * <p>
686 * TODO: In future this may be done by elimination of phenotypic near clones
687 * by eliminating those Scorers that are have the same predction performance over a given calbration set.
688 * <p>
689 * This may not trim out anything from the specified sub-population.
690 * <p>
691 * This will never remove the 'best' item nor the base/parameterless Scorer.
692 * <p>
693 * Values removed by scavenging may be pushed into the back-stop LRU cache
694 * in case they are used again soon by the outside world;
695 * thus we may limit the amount of any scavengine we do in one go
696 * so as to avoid overflowing this back-stop cache.
697 * <p>
698 * This tries a series of more drastic pruning measures until it zaps some/enough Scorers,
699 * with the inital pruning passes being very quick and gentle.
700 * <p>
701 * This does not guarantee to remove any Scorers at all.
702 * <p>
703 * This must be called under the data lock.
704 * <p>
705 * Run time is expected to average about O(1) (peak O(n)) for a size-n sub-population.
706 *
707 * @param largestSubPopulationBasename
708 */
709 private void _scavenge(final String largestSubPopulationBasename)
710 {
711 assert((largestSubPopulationBasename != null) && (_scorerByGoodness.get(largestSubPopulationBasename) != null));
712 assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
713
714 final long startTime = System.currentTimeMillis();
715 final long currentTick = EventPeriod.VLONG.getIntervalNumber(startTime);
716
717 // Compute the maximum number of Scorers to attempt to evict in this run.
718 final NavigableSet<String> subPop = _scorerByGoodness.get(largestSubPopulationBasename);
719
720 // Record the size of the sub-population when we start the scavenge.
721 final int startSubPopSize = subPop.size();
722
723 // MINOR scavenge limits.
724 // Cap this to a very slow-growing fraction of the sub-population size
725 // so as to keep the mean run-time very close to O(1).
726 final int maxMinorPurgeCount = Math.min(2+GenUtils.log2Approx(startSubPopSize), startSubPopSize-2);
727 // Return immediately if not big enough for us to do even a decent minor scavenge from.
728 if(maxMinorPurgeCount < 2) { return; }
729
730 //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging... Sub-population size = "+(startSubPopSize)+".]"); }
731
732 // The set of Scorers that we wish to evict...
733 final Set<String> evictees = new HashSet<String>(16 + 2*maxMinorPurgeCount);
734
735 // Do MINOR purge...
736 // Remove anything from the extreme tail end (least goodness) part of the sub-population:
737 // * Without merit (score < 0).
738 // * Very similar to any item better than it in this tail.
739 //
740 // We copy this always-very-small tail into a new List, worst-first.
741 final List<String> tail = new ArrayList<String>(maxMinorPurgeCount);
742 final Iterator<String> tailIt = subPop.iterator();
743 for(int i = maxMinorPurgeCount; --i >= 0; ) { tail.add(tailIt.next()); }
744 assert(tail.size() == maxMinorPurgeCount);
745 assert(tail.size() < startSubPopSize); // Ensure that we can never zap the 'best' Scorer by mistake.
746 for(int i = 0; i < maxMinorPurgeCount; ++i)
747 {
748 final String n = tail.get(i);
749 // Never try and remove the base/parameterless Scorer from this sub-population.
750 if(n.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
751 final ScoreAndConf sac = _scorerScores.get(n).second;
752 // Zap anything without a positive score/confidence.
753 if((sac.score <= 0) || (sac.confidence == 0))
754 {
755 evictees.add(n);
756 _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
757 continue;
758 }
759 // Zap anything very at all to any Scorer better than it in the tail.
760 final ScorerIF sc = getScorerInstance(n);
761 assert(n != null);
762 for(int j = i+1; j < maxMinorPurgeCount; ++j)
763 {
764 // We need to be at least a little different to everything else better than us
765 // if we are to be retained.
766 if(AbstractScorer.verySimilar(sc, getScorerInstance(tail.get(j))))
767 {
768 assert(!n.equals(tail.get(j))); // All Scorers in the tail must be different...
769 evictees.add(n);
770 _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
771 break;
772 }
773 }
774 }
775
776 // If the minor scavenge identified any poor Scorer
777 // then zap them and avoid a major/slow scavenge if more than one was found.
778 if(!evictees.isEmpty())
779 {
780 // Must remove from sub-population before removing from _scorerScores.
781 subPop.removeAll(evictees);
782 _scorerScores.keySet().removeAll(evictees);
783
784 // We can avoid a major collection if we did better than simply trimming one entry
785 // which the quick-and-easy default non-scavenge mechanism will do anyway.
786 if(evictees.size() > 1)
787 {
788 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: minor scavenge successful "+evictees.size()+'/'+startSubPopSize+" Scorers.]"); }
789 return;
790 }
791
792 evictees.clear(); // Don't need to carry these over to the major collection.
793 // Fall through to consider doing a major scavenge...
794 }
795
796
797 // MAJOR scavenge limits.
798 // Cap this to a small fraction of the sub-population size so as not to extinguish too much at once.
799 // Cap this also to a fraction of the LRU backstop cache size so as not to overwhelm/flush it.
800 final int maxPurgeCount = Math.min(subPop.size()/8, _backstopLRU.size()/2);
801 // Return immediately if not big enough for us to scavenge from...
802 if(maxPurgeCount < 1) { return; }
803
804 // After failure of a minor scavenge to make much/any space
805 // we may wish to fall back to our default kill-the-worst-one tactic,
806 // or we may try to force a major collection likely to reclaim a decent chunk immediately.
807 if(!FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS)
808 {
809 // Most of the time, quit before slow/major scavenges to keep the mean execution time low.
810 // We only return on > 0 so that this will fail and fall through the first time.
811 if(countdownToNextScavenge.getAndDecrement() > 0) { return; }
812 // Postpone the next major scavenge enough to keep its contribution ~ O(1).
813 countdownToNextScavenge.addAndGet(1+maxPurgeCount);
814 }
815
816 //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging (MAJOR)... Sub-population size = "+(subPop.size())+".]"); }
817
818 // Work through the Scorers from worst to best,
819 // stopping when we've found enough candidates.
820 // We examine the Scorers in pairs,
821 // deleiberately leaving at least alternate values untouched on any one pass.
822 for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
823 {
824 // Get 'less good' of the two Scorers that we will examine.
825 final String worse = it.next();
826
827 // Never try and scavenge/remove the best (last) Scorer in this sub-population...
828 if(!it.hasNext()) { break; }
829 // Never try and remove the base/parameterless Scorer from this sub-population.
830 if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
831
832 // Get the 'better' of the two Scorers compared this time.
833 assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
834 final String better = it.next();
835
836 // If the worse of the pair is sufficiently old that we wouldn't return it even with allowStale
837 // then dump it immediately (don't even push it into the back-stop cache).
838 // This doesn't have to exactly match the cache behaviour (though doing so is good),
839 // but simply allows us to kick out very old Scorers that have not been re-evaluated at all recently
840 // so may be dead wood consuming valuable space.
841 final Pair<Long, ScoreAndConf> pairW = _scorerScores.get(worse);
842 final long tickWhenCached = EventPeriod.VLONG.getIntervalNumber(pairW.first);
843 if(currentTick - tickWhenCached > 1)
844 {
845 evictees.add(worse);
846 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged old Scorer: " + _scorerScores.get(worse) + ".]"); }
847 continue;
848 }
849
850 // Only proceed if the scores of the two Scorers are reasonably similar.
851 final ScoreAndConf sacW = pairW.second;
852 final ScoreAndConf sacB = _scorerScores.get(better).second;
853 // For scores very close to zero we only treat identical scores as similar...
854 if((SIMILAR_SCORE_DELTA == 0) || (Math.abs(sacB.score) < (SIMILAR_SCORE_DELTA<<4)))
855 { if(sacW.score == sacB.score) { continue; } }
856 else if(Math.abs(sacW.score - sacB.score) > SIMILAR_SCORE_DELTA) { continue; }
857
858 // Check for genotypic similarity...
859 // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
860 if(!AbstractScorer.verySimilar(getScorerInstance(worse), getScorerInstance(better))) { continue; }
861
862 // Mark this Scorer for removal (and push into the back-stop cache immediately).
863 evictees.add(worse);
864 _backstopLRU.put(worse, _scorerScores.get(worse));
865 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
866 }
867
868 // If we didn't manage to evict more than one Scorer with out subtle weeding
869 // (which the quick-and-easy default non-scavenge mechanism would do anyway)
870 // then run an increasingly harsh filter until we fill our quota
871 // so as to pay for the time spent scavenging so far.
872 // We still will never evict the best Scorer,
873 // nor more than a small subset of all Scorers,
874 // and we still start evicting the poorest Scorers overall,
875 // so at worst this is probably similar to having a somewhat smaller maximum population size.
876 if(evictees.size() <= 1)
877 {
878 // Require adjacent Scorers by 'goodness' to be less and less similar to survive each pass
879 // until we manage to force enough Scorers out to give lots of elbow room for new entrants
880 // or all remaining Scorers really seem (genotypically) different
881 // (or we are effectively just forcing Scorers out by lottery).
882 final int nParams = getScorerInstance(subPop.first()).getParameterDefsAndValues().size();
883 // Start with small steps, ie using the finest razor available,
884 // but limit the maximum number of passes even for large parameter sets,
885 // to very approximately O(log(nParams)) unless we make good progress.
886 // We double the step if we did not find new evictees on the previous pass,
887 // else if we did clear some then we set the step back to 1 for maximum resolution.
888 boolean clearedOnLastPass; // True if we cleared anything on the last pass.
889 int step = 1; // Current similarity (decrease) step; strictly positive.
890 for(int similarity = nParams; (similarity > 0) && (evictees.size() < maxPurgeCount); similarity -= step, step = (clearedOnLastPass ? 1 : (2*step)))
891 {
892 //if(IsDebug.isDebug) { System.out.println("_scavenge(): nParams="+nParams+", similarity="+similarity+", step="+step+", evictees.size()="+evictees.size()); }
893 clearedOnLastPass = false;
894 for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
895 {
896 // Get 'less good' of the two Scorers that we will examine.
897 final String worse = it.next();
898
899 // Never try and scavenge/remove the best (last) Scorer in this sub-population...
900 if(!it.hasNext()) { break; }
901 // Never try and remove the base/parameterless Scorer from this sub-population.
902 if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
903 // Skip any Scorer already waiting to be purged...
904 if(evictees.contains(worse)) { continue; }
905
906 // Get the 'better' of the two Scorers compared this time.
907 assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
908 final String better = it.next();
909 // Skip any Scorer already waiting to be purged...
910 if(evictees.contains(better)) { continue; }
911
912 // Check for genotypic similarity...
913 // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
914 if(!AbstractScorer.similarNParams(getScorerInstance(worse), getScorerInstance(better), similarity)) { continue; }
915
916 // Mark this Scorer for removal (and push into the back-stop cache immediately).
917 evictees.add(worse);
918 _backstopLRU.put(worse, _scorerScores.get(worse));
919 clearedOnLastPass = true;
920 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged near ("+similarity+") 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
921 }
922 }
923 }
924
925 // Now efficiently purge all the evictees (if any) en masse.
926 if(!evictees.isEmpty())
927 {
928 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged "+evictees.size()+'/'+_scorerScores.size()+" Scorers at "+((System.currentTimeMillis()-startTime)/evictees.size())+"ms/Scorer.]"); }
929 // Must remove from sub-population before removing from _scorerScores.
930 subPop.removeAll(evictees);
931 _scorerScores.keySet().removeAll(evictees);
932 }
933 }
934
935 /**Clear any caches which depend on the population state remaining unchanged.
936 * This clearing is done within the lock that protects the internal data structures
937 * (even though the caches can be safely accessed outside the lock)
938 * to minimise unwanted races (and eliminate them for lock-protected cache accesses).
939 */
940 private void _clearCaches()
941 {
942 synchronized(_lock)
943 {
944 // Clear any caches which may be otherwise invalid after an update.
945 _gCSWP_cache = null; // Will need to recompute the "best" Scorer set.
946 _gBS_cache.clear(); // Will need to recompute any breeding set.
947 }
948 }
949
950 /**If true, allow posting the global event for exchange of the best/new Scorers.
951 * The mechanism must also be enabled in the GenProps for this to happen.
952 */
953 private static final boolean USE_GLOBAL_EVENTS_ALSO = true;
954
955 /**Get current population size; non-negative.
956 * This value may slightly exceed the requested maximum.
957 * <p>
958 * This does not include the count of any entries in the back-stop LRU (negative) cache.
959 *
960 * @return count of unique Scorers that might be selected as part of a "breeding set"
961 */
962 public int size() { return(_scorerScores.size()); }
963
964 /**Return the target maximum population size; non-negative.
965 * The actual population size may exceed this somewhat; this is a target.
966 */
967 public int maxSize() { return(maxPopulationSize); }
968
969 /**Cache of result from getCurrentScorersWithParameters(); may be null if nothing cached.
970 * Initially null, and cleared on any update (put) to the population.
971 * <p>
972 * Any value cached here must be immutable for safety and sanity.
973 * <p>
974 * Marked volatile for thread-safe lock-free access.
975 */
976 private volatile Set<String> _gCSWP_cache;
977
978 /**Current immutable set of available Scorer names and parameters (where applicable); never null but may be empty.
979 * The values returned are of the form <i>ScorerName{:name=value}*</i>.
980 * <p>
981 * The Scorers returned by this will generally be the best available,
982 * suitable for making a real prediction of and exhibit score,
983 * and will usually be the best one or handful per base-Scorer type,
984 * allowing that different Scorer types may have different "skill" domains,
985 * and excluding any with non-positive goodness.
986 * <p>
987 * This routine may be expensive, though will try to be efficient.
988 */
989 public Set<String> getCurrentScorersWithParameters(final boolean allowStale)
990 {
991 // Check the cache first.
992 // The cache is cleared by any put/update of the population.
993 final Set<String> cachedValue = _gCSWP_cache;
994 if(_gCSWP_cache != null) { return(cachedValue); }
995
996 // Result set, of only +ve goodness Scorers.
997 final Set<String> r = new HashSet<String>(4 * baseScorers.size());
998
999 // Hold a lock on the data structures while working.
1000 synchronized(_lock)
1001 {
1002 // Attempt to a candidate from each base type (sub population).
1003 for(final NavigableSet<String> subPop : _scorerByGoodness.values())
1004 {
1005 if(!subPop.isEmpty())
1006 {
1007 // Have the number of samples from each base name/type to include >= 1
1008 // (zero iff there are no eligible candidates at all)
1009 // and growing only (very) slowly with the population size.
1010 // In any case we try to return a diverse group from each sub-population.
1011 // We use log(log(n)) to allow for the possibility of multiple distinct useful candidates
1012 // in very large populations, without wasting too much time on many near clones.
1013 final int maxToInclude = Math.max(1, GenUtils.log2Approx(Math.max(1, GenUtils.log2Approx(subPop.size()))));
1014
1015 // Get the best Scorer of this type and its "goodness".
1016 final String best = subPop.last();
1017 final int bGoodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(best).second);
1018 if(bGoodness <= 0) { continue; /* All bad! */ }
1019 // We'll accept other Scorers nearly as good as the best,
1020 // but NOT of identical goodness or otherwise too similar (pheno/genotypically)
1021 // as we want (useful) variety in what we return, eg for multimodal matching.
1022 final int threshold = bGoodness/2;
1023 // Set of goodnesses so far: we don't return two identical.
1024 final Set<Integer> goodnesses = new HashSet<Integer>(2 * maxToInclude);
1025 // Set of scores so far: we don't return two identical.
1026 final Set<Short> scores = new HashSet<Short>(2 * maxToInclude);
1027 // Set of Scorer instances so far; we don't return two 'very similar'.
1028 final Collection<ScorerIF> instances = new ArrayList<ScorerIF>(maxToInclude);
1029 // Work backwards from the best (which is always included)...
1030 int remaining = maxToInclude;
1031 main: for(final String snp : subPop.descendingSet())
1032 {
1033 final ScoreAndConf sac = _scorerScores.get(snp).second;
1034 final Short score = sac.score;
1035 if(scores.contains(score)) { continue; /* Identical to a previous score. */ }
1036 scores.add(sac.score);
1037 final int g = ScoreAndConf.computeScorerGoodness(sac);
1038 // Stop if the goodness falls too far (or to/below zero).
1039 if(g <= threshold) { break; }
1040 assert(g > 0); // Only keeping Scorers with merit.
1041 final Integer G = g;
1042 if(goodnesses.contains(G)) { continue; /* Identical to a previous goodness. */ }
1043 goodnesses.add(G);
1044 // Make sure that this Scorer is not of very similar genotype to a previous one.
1045 final ScorerIF sc = getScorerInstance(snp);
1046 assert(sc != null);
1047 // We require increasingly-different Scorer instances as we gather more Scorers.
1048 final int maxSimilarity = Math.max(0, sc.getParameterDefsAndValues().size() - instances.size());
1049 for(final ScorerIF prevSc : instances)
1050 { if(AbstractScorer.similarNParams(prevSc, sc, maxSimilarity)) { continue main; /* Similar genotype to extant returnees. */ } }
1051 // Record new genetically-different Scorer.
1052 instances.add(sc);
1053 // This Scorer is good enough to return.
1054 r.add(snp);
1055 // Stop if we've collected enough.
1056 if(--remaining <= 0) { break; }
1057 }
1058 assert(r.contains(best));
1059 }
1060 }
1061 }
1062
1063 // We avoid cacheing empty results to assist with fast bootstrapping.
1064 if(r.isEmpty()) { return(Collections.emptySet()); }
1065 // Wrap the result to make it immutable, and cache it (unless empty).
1066 final Set<String> result = Collections.unmodifiableSet(r);
1067 _gCSWP_cache = result;
1068
1069 return(result);
1070 }
1071
1072 /**Cache of results for getBreedingSet(); never null.
1073 * Cleared by any put/update operation on the population.
1074 * <p>
1075 * Thread-safe map from (valid) baseName to breeding set.
1076 */
1077 private final ConcurrentMap<String, List<String>> _gBS_cache = new ConcurrentHashMap<String, List<String>>();
1078
1079 /**Get immutable random-access list of Scorers to "breed" from, best first; never null, possibly empty.
1080 * This returns all of (or some of the best of) the Scorers with the given base name.
1081 * <p>
1082 * This generally trims away all Scorers with non-positive values,
1083 * but will always try to return the few best items regardless,
1084 * ie the result is not completely empty unless the specified sub-population is.
1085 * <p>
1086 * The result will generally be large if the population is.
1087 */
1088 public List<String> getBreedingSet(final String baseName)
1089 {
1090 // Validate the base name first.
1091 if(baseName == null) { throw new IllegalArgumentException(); }
1092 if(!baseScorers.keySet().contains(baseName)) { throw new IllegalArgumentException(); }
1093
1094 final List<String> cachedValue = _gBS_cache.get(baseName);
1095 if(cachedValue != null) { return(cachedValue); }
1096
1097 // Hold lock while accessing mutable data.
1098 synchronized(_lock)
1099 {
1100 // Get in "worst-first" order.
1101 final ArrayList<String> r = new ArrayList<String>(_scorerByGoodness.get(baseName));
1102 // Convert to "best-first" order.
1103 Collections.reverse(r);
1104 final int size = r.size();
1105 final int limit = size/4;
1106 // Eliminate all those with non-positive goodness if possible,
1107 // but don't remove the best ~25% however poor they may appear to be
1108 // else we will not have any material to breed (and try to improve) from.
1109 for(int i = size; --i > limit; )
1110 {
1111 final int goodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(r.get(i)).second);
1112 if(goodness > 0) { break; }
1113 // Remove this trailing zero-goodness item.
1114 r.remove(i);
1115 }
1116 // Result must not be empty unless the sub-population is.
1117 assert(_scorerByGoodness.get(baseName).isEmpty() || !r.isEmpty());
1118
1119 // Wrap the result for immutablity and cache it.
1120 final List<String> result = Collections.unmodifiableList(r);
1121 _gBS_cache.putIfAbsent(baseName, result); // Reduce chance of update races.
1122
1123 return(result);
1124 }
1125 }
1126
1127 /**Interface of call-back hook to report a new "best" Scorer by name, eg to the system event record.
1128 */
1129 public interface NewBestCallbackIF
1130 {
1131 /**Used to post a new "best of breed" Scorer name and parameters for posterity.
1132 * This routine should not throw any exceptions.
1133 * @param scorerNameAndParameters valid name-and-parameters set; never null.
1134 */
1135 public void reportNewBestScorer(final String scorerNameAndParameters);
1136 }
1137
1138
1139 /**Small thread-safe LRU cache for getScorerInstance(); never null.
1140 * Intended to avoid repeated re-parsing of args and recreation of instances, etc,
1141 * for very-commonly-used Scorers used to evaluate every exhibit.
1142 * Usually we'd expect this to be a handful actually needed regularly,
1143 * but while hunting for new solutions (or scavenging for space)
1144 * we'll expect to churn this cache quite hard.
1145 */
1146 private final SimpleLRUMapAutoSizeForHitRate<String, ScorerIF> _gSI_cache;
1147
1148 /**Get Scorer instance given the Scorer{:value=name}* format; null if no such Scorer available.
1149 * Any instance returned may be a shared/cached instance rather than a new instance.
1150 * <p>
1151 * This does not attempt to canonicalise the names it is given,
1152 * so may create and cache duplicates for example.
1153 * <p>
1154 * This can be used to create any valid Scorer instance,
1155 * not just entries in the given population.
1156 */
1157 public ScorerIF getScorerInstance(final String nameAndParameters)
1158 {
1159 if(nameAndParameters == null) { throw new IllegalArgumentException(); }
1160
1161 // Try lookup in the cache; return a cached result without further ado!
1162 final ScorerIF cached = _gSI_cache.get(nameAndParameters);
1163 if(cached != null) { return(cached); }
1164
1165 // Quick extraction of base name.
1166 final int fs = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
1167 final boolean hasParams = (fs != -1);
1168 final String baseName = hasParams ? nameAndParameters.substring(0, fs) : nameAndParameters;
1169
1170 // Get a base instance of the Scorer, if any.
1171 final ScorerIF sc = baseScorers.get(baseName);
1172 // If this base name is not known/supported, stop now.
1173 if(sc == null) { return(null); }
1174
1175 // Fast-track the simple no-parameters case.
1176 if(!hasParams) { return(sc); }
1177
1178 try
1179 {
1180 // Use base non-parameterised instance to create variant.
1181 final ScorerIF newInstance = sc.createVariant(nameAndParameters);
1182 assert(newInstance != null);
1183 // Cache and return this value.
1184 _gSI_cache.put(nameAndParameters, newInstance);
1185 return(newInstance);
1186 }
1187 // Handle a simple argument-format error by propagating up to caller as usual.
1188 catch(final IllegalArgumentException e) { throw e; }
1189 // Log and propagate any other error.
1190 catch(final Exception e) { e.printStackTrace(); throw new Error(e); }
1191 }
1192
1193 }