001 /*
002 Copyright (c) 1996-2009, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.ai.scorer;
031
032 import java.io.IOException;
033 import java.util.ArrayList;
034 import java.util.Collection;
035 import java.util.Collections;
036 import java.util.Comparator;
037 import java.util.HashMap;
038 import java.util.HashSet;
039 import java.util.Iterator;
040 import java.util.List;
041 import java.util.Map;
042 import java.util.NavigableSet;
043 import java.util.Set;
044 import java.util.SortedSet;
045 import java.util.TreeSet;
046 import java.util.concurrent.ConcurrentHashMap;
047 import java.util.concurrent.ConcurrentMap;
048 import java.util.concurrent.atomic.AtomicInteger;
049
050 import org.hd.d.pg2k.svrCore.GenUtils;
051 import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
052 import org.hd.d.pg2k.svrCore.MemoryTools.SimpleLRUMapAutoSizeForHitRate;
053 import org.hd.d.pg2k.svrCore.Tuple.Pair;
054 import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
055 import org.hd.d.pg2k.svrCore.vars.EventPeriod;
056 import org.hd.d.pg2k.svrCore.vars.SimpleVariableValue;
057 import org.hd.d.pg2k.svrCore.vars.SystemVariables;
058
059 import ORG.hd.d.IsDebug;
060
061
062 /**Thread-safe container for Scorer-and-parameter information to maintain an evolving population.
063 * This separates Scorers by basic (parameterless) name,
064 * on the grounds that they are not directly comparable.
065 * <p>
066 * This has limited capacity (so as to cap resource use),
067 * and discards the least good parameterised Scorers as needed
068 * to make way for new (better) instances.
069 * (These discarded less-good Scorers may have their results retained
070 * in a smalled secondary "negative" cache to speed up re-testing.)
071 * <p>
072 * This has the automatially persists/announces/shares new/good instances
073 * using the system variables events mechanism, local and global.
074 * <p>
075 * TODO: in case of accute memory shortage abandon least-good Scorers
076 */
077 public final class ScorerPopulation
078 {
079 /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
080 * Note that we may slightly exceed the maximum specified population size for expediency,
081 * and we may dump some (less good) part of the population under memory stress.
082 *
083 * @param baseScorers non-null, non-empty, pure-base-name seed set
084 * @param maxPopulationSize maximum number of Scorer details to retain in memory,
085 * possibly excluding the fixed/base Scorers, strictly positive;
086 * this value is advisory and may be slightly exceeded for performance
087 * @param callback used to report new best-of-breed being inserted into cache;
088 * may be null
089 */
090 public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
091 final NewBestCallbackIF callback,
092 final int maxPopulationSize)
093 {
094 this.callback = callback;
095
096 if(baseScorers == null) { throw new IllegalArgumentException(); }
097 // Store immutable (defensive) copy of base Scorer names/instances.
098 this.baseScorers = Collections.unmodifiableMap(new HashMap<String, ScorerIF>(baseScorers));
099 if(this.baseScorers.isEmpty()) { throw new IllegalArgumentException(); }
100 // Validate the Scorer names and instances.
101 for(final String n : this.baseScorers.keySet())
102 {
103 // Validate the name, and make sure that there are no parameters.
104 if(!AbstractScorer.parseNameAndParameters(n).second.isEmpty())
105 { throw new IllegalArgumentException("no parameters allowed on seed set"); }
106 // Ensure that the name corresponds to the actual Scorer instance supplied.
107 final ScorerIF scorer = baseScorers.get(n);
108 if((scorer == null) || !n.equals(scorer.getBaseName()))
109 { throw new IllegalArgumentException("mismatch between Scorer base name and instance"); }
110 if(!n.equals(scorer.getNameAndParameters()))
111 { throw new IllegalArgumentException("supplied Scorer instance was parameterised"); }
112 }
113 // Create the Set of valid classes for quick validation of instances.
114 supportedScorerClasses = new HashSet<Class<? extends ScorerIF>>(2 * this.baseScorers.size());
115 for(final ScorerIF sc : this.baseScorers.values())
116 { supportedScorerClasses.add(sc.getClass()); }
117
118 // Must be at least two slots per base Scorer type/name
119 // (to allow for the base/parameterless value and one 'optimal' value).
120 if(maxPopulationSize <= 2*this.baseScorers.size()) { throw new IllegalArgumentException("population too small"); }
121 this.maxPopulationSize = maxPopulationSize;
122
123 // Size to accommodate the specified capacity without further expansion.
124 // We don't expect much write concurrency.
125 // We use a lowish load-factor to enhance read and write performance.
126 _scorerScores = new ConcurrentHashMap<String, Pair<Long, ScoreAndConf>>(2*maxPopulationSize, 0.5f, 4);
127
128 // The comparator arranges for the items to be sorted by increasing "goodness".
129 // Because of a lack of symmetry in scoring
130 // (ie reversing the prediction of a bad Scorer does not necessarily make for a good prediction)
131 // we are most interested in the highest-confidence, positive-correlation Scorers.
132 SCORER_RANK_COMPARATOR = new Comparator<String>()
133 {
134 // This treats missing items from _scorerScores as neutral (goodness 0).
135 public final int compare(final String a, final String b)
136 {
137 final ScoreAndConf sacA = _scorerScores.get(a).second;
138 final ScoreAndConf sacB = _scorerScores.get(b).second;
139 // Primary sort by goodness.
140 final int goodnessA = (sacA == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacA);
141 final int goodnessB = (sacB == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacB);
142 if(goodnessA < goodnessB) { return(-1); }
143 if(goodnessA > goodnessB) { return(+1); }
144 if((sacA != null) && (sacB != null))
145 {
146 // Secondary sort by score (if both scores available).
147 if(sacA.score < sacB.score) { return(-1); }
148 if(sacA.score > sacB.score) { return(+1); }
149 }
150 // Break ties by name to ensure a total ordering.
151 return(a.compareTo(b));
152 }
153 };
154 // Create the (immutable) outer map with empty sub-maps for _scorerByGoodness.
155 final Map<String, NavigableSet<String>> sbg = new HashMap<String, NavigableSet<String>>(2 * baseScorers.size());
156 for(final String n : this.baseScorers.keySet())
157 { sbg.put(n, new TreeSet<String>(SCORER_RANK_COMPARATOR)); }
158 _scorerByGoodness = Collections.unmodifiableMap(sbg);
159
160 // Backup LRU cache, mainly for "bad" Scorer results.
161 // This also accommodates 'clones' evicted from the main population.
162 final int _backstopLRUMaxSize = 1 + maxPopulationSize/2;
163 _backstopLRU = SimpleLRUMapAutoSizeForHitRate.<String, Pair<Long, ScoreAndConf>>create(1+(_backstopLRUMaxSize/8), _backstopLRUMaxSize, "_backstopLRU");
164
165 // Create Scorer lookup cache, size limited, in proportion to max population size.
166 // This allows us to hold in cache every live Scorer in the main population.
167 // This assumes that each Scorer instance does not consume much memory/resource.
168 // Since we may well need to churn through the entire population sequentially,
169 // eg during scavenging or possibly while selecting a breeding set,
170 // this cache should be significantly larger than the population size to be of most value.
171 // TODO: should possibly be held via SoftReference to be dropped during extreme memory stress.
172 _gSI_cache = SimpleLRUMapAutoSizeForHitRate.<String, ScorerIF>create(1+(maxPopulationSize/8), 1 + ((3*maxPopulationSize)/2), "_gSI_cache");
173 }
174
175 /**Comparator used to rank Scorers relative to one another by increasing goodness, never null.
176 * Scorer values not currently represented in the population
177 * are treated as if goodness 0 (ie neutral).
178 * <p>
179 * If both goodnesses are available a secondary ranking is done
180 * by increasing score.
181 * <p>
182 * Complete ties are broken by Scorer name.
183 */
184 public final Comparator<String> SCORER_RANK_COMPARATOR;
185
186 /**If true then we may kick out apparent-near duplicates as the population reaches maximum.
187 * The aim of doing this is to increase diversity of the 'gene pool' for a given capacity,
188 * by making space for significantly different genotypes
189 * and avoiding them being swamped or pushed out by many near-clones of the overal best Scorer
190 * in any given sub-population.
191 * <p>
192 * This scavenging may have a random element.
193 * <p>
194 * Scavenging may be quite slow.
195 */
196 private static final boolean SCAVENGE_FOR_DIVERSITY = true;
197
198 /**Reset/clear the population to its initial (empty) state.
199 * Does not shut out all other activity,
200 * so it is possible that new items may be inserted while this is running.
201 */
202 public void clear()
203 {
204 // First clear the back-stop cache.
205 _backstopLRU.clear();
206 // Now systematically clear the main data stores and caches under the data lock.
207 synchronized(_lock)
208 {
209 _clearCaches();
210 _scorerScores.clear();
211 for(final NavigableSet<String> nm : _scorerByGoodness.values())
212 { nm.clear(); }
213 }
214 // Finally clear out the Scorer instance cache.
215 _gSI_cache.clear();
216 }
217
218 /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
219 * Note that we may slightly exceed the maximum specified population size for expediency,
220 * and we may dump some (less good) part of the population under memory stress.
221 * <p>
222 * This automatically posts new best-of-breed Scorers to the system variables event store
223 * as they are cached.
224 *
225 * @param baseScorers non-null, non-empty, pure-base-name seed set
226 * @param maxPopulationSize maximum number of Scorer details to retain in memory,
227 * possibly excluding the fixed/base Scorers, strictly positive;
228 * this value is advisory and may be slightly exceeded for performance
229 * @param vars system variables pipeline to record new/best Scorer events to
230 */
231 public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
232 final int maxPopulationSize,
233 final SimpleExhibitPipelineIF vars)
234 {
235 this(baseScorers,
236 (new NewBestCallbackIF() {
237 public final void reportNewBestScorer(final String nameAndParameters)
238 {
239 final SimpleVariableValue svvL = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_LOCAL_EVENT,
240 nameAndParameters);
241 try { vars.setVariable(svvL); }
242 catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
243 try
244 {
245 if(USE_GLOBAL_EVENTS_ALSO &&
246 "true".equalsIgnoreCase(vars.getGenProps(-1).getGen().get(GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX)))
247 {
248 final SimpleVariableValue svvG = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_GLOBAL_EVENT,
249 nameAndParameters);
250 try { vars.setVariable(svvG); }
251 catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
252 }
253 }
254 catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
255 }
256 }),
257 maxPopulationSize);
258 }
259
260 /**Maximum number of Scorer details to retain, possibly excluding the fixed/base Scorers, strictly positive.
261 * Must in fact be large than the number of base Scorer types,
262 * since we will need at least once slot for each non-parameterised type.
263 */
264 private final int maxPopulationSize;
265
266 /**Immutable Map from base Scorer names (no parameters) to base/parameterless instances to monitor/maintain; non-null, non-empty. */
267 private final Map<String, ScorerIF> baseScorers;
268
269 /**Set of class values for the supported Scorer types for fast validation; never null. */
270 private final Set<Class<? extends ScorerIF>> supportedScorerClasses;
271
272 /**Callback to report new best-of-breed being inserted into cache; may be null. */
273 private final NewBestCallbackIF callback;
274
275 /**A back-stop LRU cache of Scorer goodness, so that in particular the results of poor Scorers need not be excessively recomputed; never null.
276 * Normally we only keep the ScoreAndConf of the best Scorers,
277 * but if we happen to be repeatedly asked for the results of poor Scorers
278 * and we don't hold those results because they are not good to be in the main population
279 * then we may simply waste disproportionate time recomputing them.
280 * <p>
281 * This aims to be a bonus back-stop cache of all results not in the main cache,
282 * and is not strictly part of the counted population
283 * because these are not part of the potential "breeding set":
284 * they are here only for "negative cacheing" of poor Scorer parameters sets.
285 * <p>
286 * We keep it much smaller than the main population
287 * so that it is not a significant resource drain.
288 * <p>
289 * We push Scorers into it when they are displaced from the main population.
290 * <p>
291 * Thread-safe.
292 * <p>
293 * It is permissible and viable to hold a lock on this object over multiple operations
294 * to make them into one compound atomic operation.
295 */
296 private final SimpleLRUMapAutoSizeForHitRate<String, Pair<Long, ScoreAndConf>> _backstopLRU;
297
298 /**Non-user-visible lock to serialise activity on _scorerScores and _scorerByGoodness; never null.
299 * No blocking/external calls are made in the scope of this lock
300 * so as to ensure that it is held only for very short intervals.
301 */
302 private final Object _lock = new Object();
303
304 /**Thread-safe cache of Scorer scores (and time when cached), indirectly size-limited; never null.
305 * Map from scorer name-and-parameters to ScoreAndConfidence and the time computed/cached.
306 * <p>
307 * Single get() accesses to this can be made safely without further locking
308 * so as to maximise concurrency.
309 * <p>
310 * Note that since a put() operation does not lock out get() operations,
311 * any updates on the main _scorerScores map must be incrementally valid,
312 * ie incorrect data must never be inserted into the _scorerScores map.
313 */
314 private final ConcurrentMap<String, Pair<Long, ScoreAndConf>> _scorerScores;
315
316 /**Map from base Scorer name to a least-good-first NavigableSet of full nameAndParameters; never null.
317 * We pre-populate this with empty maps under each base name that we handle
318 * (so it is never necessary to test for presence in, or ever alter, this outer map; hence it is immutable).
319 * <p>
320 * The top-level Comparator is selected to sort by goodness derived from the _scorerScores,
321 * so any extant entry in the nameAndParameters should be removed from the _scorerByGoodness map
322 * before removal from or replacement in the _scorerScores map,
323 * and re-inserted afterwards as necessary.
324 * <p>
325 * Since the different base Scorer names/types are held separately,
326 * this helps keep the probably-distinct populations from competing with one another,
327 * though the overall population size cap is still present.
328 * <p>
329 * Because of a potential lack of symmetry in scoring
330 * (ie reversing the prediction of a bad Scorer does not necessarily make it a good one)
331 * we are interested in preserving the highest-confidence, positive-correlation Scorers.
332 * <p>
333 * Not thread-safe; access only under _lock.
334 */
335 private final Map<String, NavigableSet<String>> _scorerByGoodness;
336
337 /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
338 * Never recomputes, always fetches from its cache if present, else returns null.
339 * <p>
340 * Older results (typically up to 1 day older) may be returned if allowStale is true;
341 * this may help the system overall ride out temporary I/O problems and disconnections.
342 * <p>
343 * These get operations are generally fast,
344 * though some cycle-stealing incremental housekeeping might be performed on this call.
345 *
346 * @param scorer the Scorer instance; never null
347 * @param allowStale if true then allow an older cache response
348 * (up to VLONG period older)
349 *
350 * @return the score represents the correlation with the underlying votes
351 * (and whatever the scoring is measured against)
352 * with MAX meaning perfect correlation, 0 meaning no correlation,
353 * and -MAX meaning perfectly wrong answers all the time,
354 * and the confidence 0 if we have no (or very few) data points yet
355 * and approaching MAX as we have a large (enough) number of data points
356 *
357 * @throws IllegalArgumentException if the Scorer is null
358 */
359 public ScoreAndConf getScorerWeighting(final ScorerIF scorer,
360 final boolean allowStale)
361 {
362 if(scorer == null) { throw new IllegalArgumentException(); }
363 return(getScorerWeighting(scorer.getNameAndParameters(), allowStale));
364 }
365
366 /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
367 * Never recomputes, always fetches from its cache if present, else returns null.
368 * <p>
369 * Older results (typically up to 1 day older) may be returned if allowStale is true;
370 * this may help the system overall ride out temporary I/O problems and disconnections.
371 * <p>
372 * These get operations are generally fast,
373 * though some cycle-stealing incremental housekeeping might be performed on this call.
374 *
375 * @param scorerNameAndParameters the name and parameters of the Scorer; never null
376 * @param allowStale if true then allow an older cache response
377 * (up to VLONG period older)
378 *
379 * @return the score represents the correlation with the underlying votes
380 * (and whatever the scoring is measured against)
381 * with MAX meaning perfect correlation, 0 meaning no correlation,
382 * and -MAX meaning perfectly wrong answers all the time,
383 * and the confidence 0 if we have no (or very few) data points yet
384 * and approaching MAX as we have a large (enough) number of data points
385 */
386 public ScoreAndConf getScorerWeighting(final String scorerNameAndParameters,
387 final boolean allowStale)
388 {
389 if(scorerNameAndParameters == null) { throw new IllegalArgumentException(); }
390
391 final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(System.currentTimeMillis());
392
393 // We don't need to validate the requested name (other than as being non-null)
394 // since an invalid value will correctly return null.
395 // We don't need to take any locks for this single get();
396 // this helps maximise concurrency and can proceed
397 // even when a long/slow put() operation is taking place in another thread.
398 final Pair<Long, ScoreAndConf> cachedValue = _scorerScores.get(scorerNameAndParameters);
399 // We can use a value from cache if the cached value is from the current period
400 // (or the previous period if allowStale is true).
401 if((cachedValue != null) &&
402 (EventPeriod.VLONG.getIntervalNumber(cachedValue.first) >= (allowStale ? currentPeriod-1 : currentPeriod)))
403 { return(cachedValue.second); }
404
405 // Try back-stop LRU cache.
406 // We access this out of the scope of the main lock to maximise concurrency.
407 // Note that we allow use of older values than from the main population.
408 // We actively remove expired items to make more room in this back-stop cache.
409 synchronized(_backstopLRU)
410 {
411 final Pair<Long, ScoreAndConf> backstopResult = _backstopLRU.get(scorerNameAndParameters);
412 if(backstopResult != null)
413 {
414 final long oldestRetained = currentPeriod-2;
415 final long cachedItemAge = EventPeriod.VLONG.getIntervalNumber(backstopResult.first);
416 if(cachedItemAge < oldestRetained)
417 {
418 // Remove expired value to make room for others.
419 _backstopLRU.remove(scorerNameAndParameters);
420 }
421 else if(cachedItemAge >= (allowStale ? oldestRetained : currentPeriod))
422 {
423 //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation.getScorerWeighting(): caught lookup with back-stop LRU cache.]"); }
424 return(backstopResult.second);
425 }
426 }
427 }
428
429 // Nothing in either cache...
430 return(null);
431 }
432
433 /**GenProps 'generic' parameter that must be set for us to post to the global event store. */
434 public static final String GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX = "ai.scorerGlobalPostEnable";
435
436 /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
437 * Saves the given Scorer and "goodness",
438 * if necessary evicting a poorly-performing Scorer to make room
439 * ie to limit the population size.
440 * <p>
441 * If the "put" Scorer is very good compared to the existing population members
442 * then it may get persisted/shared via the event mechanism.
443 * <p>
444 * This routine will only consider storing entries for Scorers whose base name
445 * is on the list we were constructed with.
446 * <p>
447 * Note that a put() operation does not lock out get() operations,
448 * so any updates on the main _scorerScores map must be incrementally valid,
449 * ie incorrect data must never be inserted into the _scorerScores map.
450 * <p>
451 * This will attempt to cache the actual Scorer instance provided
452 * which may save time reconstructing it later, and memory by better sharing.
453 * This instance may be recovered with getScorerInstance() later,
454 * though may have to be recreated (ie returning the same instance is never guaranteed).
455 *
456 * @param scorer the Scorer instance; never null
457 * @param sac rating of this Scorer across all exhibits; never null
458 * @param dataSource to record/persist/share good Scorers through;
459 * null if no Scorer persistence is required
460 * @param log logger; never null
461 *
462 * @return true if the Scorer result is new for its sub-population and best in it
463 * and has a positive overall "goodness" (ie has some positive predictive ability)
464 *
465 * @throws IllegalArgumentException if the Scorer is null or of an unsupported type
466 */
467 public boolean putScorerWeighting(final ScorerIF scorer,
468 final ScoreAndConf sac,
469 final SimpleExhibitPipelineIF dataSource,
470 final SimpleLoggerIF log)
471 {
472 if(scorer == null) { throw new IllegalArgumentException(); }
473 if(!supportedScorerClasses.contains(scorer.getClass())) { throw new IllegalArgumentException("unsupported Scorer class"); }
474 final String nameAndParameters = scorer.getNameAndParameters();
475 _gSI_cache.put(nameAndParameters, scorer);
476 return(putScorerWeighting(nameAndParameters, sac, dataSource, log));
477 }
478
479 /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
480 * Saves the given Scorer and "goodness",
481 * if necessary evicting a poorly-performing Scorer to make room
482 * ie to limit the population size.
483 * <p>
484 * If the "put" Scorer is very good compared to the existing population members
485 * then it may get persisted/shared via the event mechanism.
486 * <p>
487 * This routine will only consider storing entries for Scorers whose base name
488 * is on the list we were constructed with.
489 * <p>
490 * Note that a put() operation does not lock out get() operations,
491 * so any updates on the main _scorerScores map must be incrementally valid,
492 * ie incorrect data must never be inserted into the _scorerScores map.
493 *
494 * @param nameAndParameters the name and parameters of the scorer; never null
495 * @param sac rating of this Scorer across all exhibits; never null
496 * @param dataSource to record/persist/share good Scorers through;
497 * null if no Scorer persistence is required
498 * @param log logger; never null
499 *
500 * @return true if the Scorer result is new for its sub-population and best in it
501 * and has a positive overall "goodness" (ie has some positive predictive ability)
502 */
503 public boolean putScorerWeighting(final String nameAndParameters,
504 final ScoreAndConf sac,
505 final SimpleExhibitPipelineIF dataSource,
506 final SimpleLoggerIF log)
507 {
508 if(sac == null) { throw new IllegalArgumentException(); }
509 if(nameAndParameters == null) { throw new IllegalArgumentException(); }
510
511 final Pair<String, Map<String, String>> nap = AbstractScorer.parseNameAndParameters(nameAndParameters);
512 final String baseName = nap.first;
513 // Ignore if not one of our recognised base Scorers.
514 if(!baseScorers.keySet().contains(baseName)) { return(false); }
515 final int goodness = ScoreAndConf.computeScorerGoodness(sac);
516 final boolean parameterless = nap.second.isEmpty();
517
518 final long startTime = System.currentTimeMillis();
519
520 // Set true if we discover that this is the best Scorer of its type so far
521 // so we should save/share it.
522 boolean isNewAndBest = false;
523
524 // Still a candidate for storing, so grab a lock...
525 synchronized(_lock)
526 {
527 // Clear caches within the lock scope to avoid/minimise races.
528 _clearCaches();
529
530 final SortedSet<String> insertedScorerSubPopulation = _scorerByGoodness.get(baseName);
531 // Compute previous 'best' score in this sub-population if any.
532 final int prevBest = insertedScorerSubPopulation.isEmpty() ? Integer.MIN_VALUE :
533 ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second);
534
535 // Remove and return any extant value for this Scorer value, null if none previously.
536 final Pair<Long, ScoreAndConf> oldVal = _remove(nameAndParameters);
537
538 // Insert the value: first in _scorerScores, then in the dependent _scorerByGoodness.
539 // We DO NOT intern() the possibly-long name because doing so would stress the PermGen.
540 _scorerScores.put(nameAndParameters, new Pair<Long, ScoreAndConf>(startTime, sac));
541 insertedScorerSubPopulation.add(nameAndParameters);
542
543 // Test if this is the best Scorer in its sub-population
544 // and has a non-zero goodness.
545 // Note that this has already been inserted into that sub-population,
546 // so cannot be "better",
547 // so we test against the previous 'best' value if any.
548 assert(goodness <= ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second));
549 if((goodness > 0) && (goodness > prevBest) && (oldVal == null))
550 { isNewAndBest = true; }
551
552 // If the population is oversize following this insertion
553 // then we'll cull the worst Scorer from the largest sub-population if possible.
554 if(_scorerScores.size() > maxPopulationSize)
555 {
556 // Find the largest sub-population.
557 int largestSubPopulation = -1;
558 String largestSubPopulationBasename = null;
559 for(final String subPopBaseName : _scorerByGoodness.keySet())
560 {
561 final int subPopSize = _scorerByGoodness.get(subPopBaseName).size();
562 if(subPopSize > largestSubPopulation)
563 {
564 largestSubPopulation = subPopSize;
565 largestSubPopulationBasename = subPopBaseName;
566 }
567 }
568 assert(largestSubPopulation > 0);
569 assert(largestSubPopulationBasename != null);
570
571 // Don't attempt scavenging all the time, since it may be very expensive, ie O(n).
572 // We try to reduce the average time scavenging to about O(1) on average.
573 // Scavenging is likely to work more effectively per-call if called infrequently.
574 if(SCAVENGE_FOR_DIVERSITY)
575 {
576 // Attempt one scavenging (clone-stripping) run on this (largest) sub-population.
577 _scavenge(largestSubPopulationBasename);
578 }
579
580 // If scavenging failed to recover any space then evict the least-good Scorer.
581 if(_scorerScores.size() > maxPopulationSize)
582 {
583 // ZAP LEAST-GOOD SCORER SO AS TO CAP THE POPULATION SIZE IF (STILL) OVERSIZE...
584 // Iterate through the largest population
585 // culling the first (least good) parameterised Scorer(s) found, if any.
586 // We may not be able to remove anything, leaving the population slightly oversized.
587 final SortedSet<String> victimSubPop = _scorerByGoodness.get(largestSubPopulationBasename);
588 for(final String putativeVictim : victimSubPop)
589 {
590 // We never remove the parameterless base Scorer from its sub-population.
591 if(putativeVictim.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
592 // We push this about-to-be-zapped victim into the back-stop LRU cache...
593 _backstopLRU.put(putativeVictim, _scorerScores.get(putativeVictim));
594 // Now remove the victim from the main data stores.
595 final Pair<Long, ScoreAndConf> zapped = _remove(putativeVictim);
596 assert(zapped != null); // Must have actually removed the specified 'poor' Scorer.
597 // Having trimmed the population (and altered the collections) we must now stop.
598 break;
599 }
600 }
601 }
602 }
603
604 // If this a new/best +ve-goodness Scorer (in its sub-population) then post it locally
605 // if not or parameterless, and if we have a live dataSource.
606 // This is done outside of the lock scope
607 // to avoid blocking other get/put operations during I/O.
608 // We absorb (and report) any I/O error, but continue.
609 if(isNewAndBest && !parameterless && (callback != null))
610 {
611 if(IsDebug.isDebug) { log.log("[ScorerPopulation: new best Scorer: "+nameAndParameters+" "+sac+", 'goodness'="+goodness+"...]"); }
612 callback.reportNewBestScorer(nameAndParameters);
613 }
614
615 return(isNewAndBest);
616 }
617
618 /**Remove and return any extant value for this Scorer value, null if none previously.
619 * Must only be called under the data lock to be atomic wrt the data stores.
620 *
621 * @param nameAndParameters non-null valid Scorer value incluing optional parameters
622 *
623 * @return previous timestamp/value for this Scorer, else null if nne
624 */
625 private synchronized Pair<Long, ScoreAndConf> _remove(final String nameAndParameters)
626 {
627 assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
628
629 // See if we have an existing entry for this nameAndParameters.
630 // (If not, there is nothing to remove.)
631 final Pair<Long, ScoreAndConf> oldVal = _scorerScores.get(nameAndParameters);
632 if(oldVal == null) { return(null); }
633 // If we do, then remove the corresponding entry from _scorerByGoodness first
634 // else we will break its comparator/lookup which depends on _scorerScores.
635 // Quickly extract the base name of the Scorer to select the sub-population.
636 final int firstSep = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
637 final String baseName = (firstSep == -1) ? nameAndParameters : (nameAndParameters.substring(0, firstSep));
638 // Remove from the appropriate sub-population first...
639 final SortedSet<String> subPopulation = _scorerByGoodness.get(baseName);
640 assert (subPopulation != null);
641 assert (subPopulation.contains(nameAndParameters));
642 subPopulation.remove(nameAndParameters);
643 assert (!subPopulation.contains(nameAndParameters));
644 // Then remove from the main mapping, ie _scorerScores.
645 _scorerScores.remove(nameAndParameters);
646 return(oldVal);
647 }
648
649 /**Countdown to next scavenge: when zero then scavenge and add on current population size. */
650 private final AtomicInteger countdownToNextScavenge = new AtomicInteger();
651
652 /**Minimum 'delta' in score for two Scorers to be considered potentially similar; non-negative.
653 * This is the minimum difference in the raw score, not weighted by confidence,
654 * for two Scorers on similar overall goodness to be considered 'similar',
655 * given that we will then further filter by genotype and/or phenotype,
656 * the assumption being that differences of this scale may just be (rounding and other) noise.
657 * <p>
658 * A good value for this is probably less than 1% of maximum,
659 * even though many Scorers will struggle to be much better than chance than this.
660 * <p>
661 * Note that we may apply a tighter test to values very close to zero.
662 * <p>
663 * A value of zero indicates that scores have to be identical
664 * to be considered similar by this measure.
665 */
666 private static final int SIMILAR_SCORE_DELTA = Math.max(2, ScoreAndConf.MAX_GOODNESS >>> 7);
667
668 /**If true then if a minor scavenge fails always force a major scavenge.
669 * This may be expensive but more-or-less guarantees
670 * that there will always be some space in the population
671 * for marginal but unusual Scorers to have a chance to be bred from.
672 */
673 private static final boolean FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS = true;
674
675 /**Attempt to strip out near-clones from the population to make space for more diversity.
676 * This works on the assumption that there may be many valuable Scorer values
677 * that while not optimal across all exhibits or at this very moment,
678 * may be valuable on sub-sets of the exhibits now or in the near future.
679 * This routine attempts to prevent limited population space (gene-pool capacity)
680 * from being overrun with very slight variations on (effective clones of) a monoculture.
681 * <p>
682 * Currently this is done by elimination of genotypic near-clones
683 * by eliminating those Scorers that have very similar Scores and no sufficiently-distinct genes.
684 * <p>
685 * TODO: In future this may be done by elimination of phenotypic near clones
686 * by eliminating those Scorers that are have the same predction performance over a given calbration set.
687 * <p>
688 * This may not trim out anything from the specified sub-population.
689 * <p>
690 * This will never remove the 'best' item nor the base/parameterless Scorer.
691 * <p>
692 * Values removed by scavenging may be pushed into the back-stop LRU cache
693 * in case they are used again soon by the outside world;
694 * thus we may limit the amount of any scavengine we do in one go
695 * so as to avoid overflowing this back-stop cache.
696 * <p>
697 * This tries a series of more drastic pruning measures until it zaps some/enough Scorers,
698 * with the inital pruning passes being very quick and gentle.
699 * <p>
700 * This does not guarantee to remove any Scorers at all.
701 * <p>
702 * This must be called under the data lock.
703 * <p>
704 * Run time is expected to average about O(1) (peak O(n)) for a size-n sub-population.
705 *
706 * @param largestSubPopulationBasename
707 */
708 private void _scavenge(final String largestSubPopulationBasename)
709 {
710 assert((largestSubPopulationBasename != null) && (_scorerByGoodness.get(largestSubPopulationBasename) != null));
711 assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
712
713 final long startTime = System.currentTimeMillis();
714 final long currentTick = EventPeriod.VLONG.getIntervalNumber(startTime);
715
716 // Compute the maximum number of Scorers to attempt to evict in this run.
717 final NavigableSet<String> subPop = _scorerByGoodness.get(largestSubPopulationBasename);
718
719 // Record the size of the sub-population when we start the scavenge.
720 final int startSubPopSize = subPop.size();
721
722 // MINOR scavenge limits.
723 // Cap this to a very slow-growing fraction of the sub-population size
724 // so as to keep the mean run-time very close to O(1).
725 final int maxMinorPurgeCount = Math.min(2+GenUtils.log2Approx(startSubPopSize), startSubPopSize-2);
726 // Return immediately if not big enough for us to do even a decent minor scavenge from.
727 if(maxMinorPurgeCount < 2) { return; }
728
729 //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging... Sub-population size = "+(startSubPopSize)+".]"); }
730
731 // The set of Scorers that we wish to evict...
732 final Set<String> evictees = new HashSet<String>(16 + 2*maxMinorPurgeCount);
733
734 // Do MINOR purge...
735 // Remove anything from the extreme tail end (least goodness) part of the sub-population:
736 // * Without merit (score < 0).
737 // * Very similar to any item better than it in this tail.
738 //
739 // We copy this always-very-small tail into a new List, worst-first.
740 final List<String> tail = new ArrayList<String>(maxMinorPurgeCount);
741 final Iterator<String> tailIt = subPop.iterator();
742 for(int i = maxMinorPurgeCount; --i >= 0; ) { tail.add(tailIt.next()); }
743 assert(tail.size() == maxMinorPurgeCount);
744 assert(tail.size() < startSubPopSize); // Ensure that we can never zap the 'best' Scorer by mistake.
745 for(int i = 0; i < maxMinorPurgeCount; ++i)
746 {
747 final String n = tail.get(i);
748 // Never try and remove the base/parameterless Scorer from this sub-population.
749 if(n.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
750 final ScoreAndConf sac = _scorerScores.get(n).second;
751 // Zap anything without a positive score/confidence.
752 if((sac.score <= 0) || (sac.confidence == 0))
753 {
754 evictees.add(n);
755 _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
756 continue;
757 }
758 // Zap anything very at all to any Scorer better than it in the tail.
759 final ScorerIF sc = getScorerInstance(n);
760 assert(n != null);
761 for(int j = i+1; j < maxMinorPurgeCount; ++j)
762 {
763 // We need to be at least a little different to everything else better than us
764 // if we are to be retained.
765 if(AbstractScorer.verySimilar(sc, getScorerInstance(tail.get(j))))
766 {
767 assert(!n.equals(tail.get(j))); // All Scorers in the tail must be different...
768 evictees.add(n);
769 _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
770 break;
771 }
772 }
773 }
774
775 // If the minor scavenge identified any poor Scorer
776 // then zap them and avoid a major/slow scavenge if more than one was found.
777 if(!evictees.isEmpty())
778 {
779 // Must remove from sub-population before removing from _scorerScores.
780 subPop.removeAll(evictees);
781 _scorerScores.keySet().removeAll(evictees);
782
783 // We can avoid a major collection if we did better than simply trimming one entry
784 // which the quick-and-easy default non-scavenge mechanism will do anyway.
785 if(evictees.size() > 1)
786 {
787 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: minor scavenge successful "+evictees.size()+'/'+startSubPopSize+" Scorers.]"); }
788 return;
789 }
790
791 evictees.clear(); // Don't need to carry these over to the major collection.
792 // Fall through to consider doing a major scavenge...
793 }
794
795
796 // MAJOR scavenge limits.
797 // Cap this to a small fraction of the sub-population size so as not to extinguish too much at once.
798 // Cap this also to a fraction of the LRU backstop cache size so as not to overwhelm/flush it.
799 final int maxPurgeCount = Math.min(subPop.size()/8, _backstopLRU.size()/2);
800 // Return immediately if not big enough for us to scavenge from...
801 if(maxPurgeCount < 1) { return; }
802
803 // After failure of a minor scavenge to make much/any space
804 // we may wish to fall back to our default kill-the-worst-one tactic,
805 // or we may try to force a major collection likely to reclaim a decent chunk immediately.
806 if(!FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS)
807 {
808 // Most of the time, quit before slow/major scavenges to keep the mean execution time low.
809 // We only return on > 0 so that this will fail and fall through the first time.
810 if(countdownToNextScavenge.getAndDecrement() > 0) { return; }
811 // Postpone the next major scavenge enough to keep its contribution ~ O(1).
812 countdownToNextScavenge.addAndGet(1+maxPurgeCount);
813 }
814
815 //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging (MAJOR)... Sub-population size = "+(subPop.size())+".]"); }
816
817 // Work through the Scorers from worst to best,
818 // stopping when we've found enough candidates.
819 // We examine the Scorers in pairs,
820 // deleiberately leaving at least alternate values untouched on any one pass.
821 for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
822 {
823 // Get 'less good' of the two Scorers that we will examine.
824 final String worse = it.next();
825
826 // Never try and scavenge/remove the best (last) Scorer in this sub-population...
827 if(!it.hasNext()) { break; }
828 // Never try and remove the base/parameterless Scorer from this sub-population.
829 if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
830
831 // Get the 'better' of the two Scorers compared this time.
832 assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
833 final String better = it.next();
834
835 // If the worse of the pair is sufficiently old that we wouldn't return it even with allowStale
836 // then dump it immediately (don't even push it into the back-stop cache).
837 // This doesn't have to exactly match the cache behaviour (though doing so is good),
838 // but simply allows us to kick out very old Scorers that have not been re-evaluated at all recently
839 // so may be dead wood consuming valuable space.
840 final Pair<Long, ScoreAndConf> pairW = _scorerScores.get(worse);
841 final long tickWhenCached = EventPeriod.VLONG.getIntervalNumber(pairW.first);
842 if(currentTick - tickWhenCached > 1)
843 {
844 evictees.add(worse);
845 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged old Scorer: " + _scorerScores.get(worse) + ".]"); }
846 continue;
847 }
848
849 // Only proceed if the scores of the two Scorers are reasonably similar.
850 final ScoreAndConf sacW = pairW.second;
851 final ScoreAndConf sacB = _scorerScores.get(better).second;
852 // For scores very close to zero we only treat identical scores as similar...
853 if((SIMILAR_SCORE_DELTA == 0) || (Math.abs(sacB.score) < (SIMILAR_SCORE_DELTA<<4)))
854 { if(sacW.score == sacB.score) { continue; } }
855 else if(Math.abs(sacW.score - sacB.score) > SIMILAR_SCORE_DELTA) { continue; }
856
857 // Check for genotypic similarity...
858 // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
859 if(!AbstractScorer.verySimilar(getScorerInstance(worse), getScorerInstance(better))) { continue; }
860
861 // Mark this Scorer for removal (and push into the back-stop cache immediately).
862 evictees.add(worse);
863 _backstopLRU.put(worse, _scorerScores.get(worse));
864 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
865 }
866
867 // If we didn't manage to evict more than one Scorer with out subtle weeding
868 // (which the quick-and-easy default non-scavenge mechanism would do anyway)
869 // then run an increasingly harsh filter until we fill our quota
870 // so as to pay for the time spent scavenging so far.
871 // We still will never evict the best Scorer,
872 // nor more than a small subset of all Scorers,
873 // and we still start evicting the poorest Scorers overall,
874 // so at worst this is probably similar to having a somewhat smaller maximum population size.
875 if(evictees.size() <= 1)
876 {
877 // Require adjacent Scorers by 'goodness' to be less and less similar to survive each pass
878 // until we manage to force enough Scorers out to give lots of elbow room for new entrants
879 // or all remaining Scorers really seem (genotypically) different
880 // (or we are effectively just forcing Scorers out by lottery).
881 final int nParams = getScorerInstance(subPop.first()).getParameterDefsAndValues().size();
882 // Start with small steps, ie using the finest razor available,
883 // but limit the maximum number of passes even for large parameter sets,
884 // to very approximately O(log(nParams)) unless we make good progress.
885 // We double the step if we did not find new evictees on the previous pass,
886 // else if we did clear some then we set the step back to 1 for maximum resolution.
887 boolean clearedOnLastPass; // True if we cleared anything on the last pass.
888 int step = 1; // Current similarity (decrease) step; strictly positive.
889 for(int similarity = nParams; (similarity > 0) && (evictees.size() < maxPurgeCount); similarity -= step, step = (clearedOnLastPass ? 1 : (2*step)))
890 {
891 //if(IsDebug.isDebug) { System.out.println("_scavenge(): nParams="+nParams+", similarity="+similarity+", step="+step+", evictees.size()="+evictees.size()); }
892 clearedOnLastPass = false;
893 for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
894 {
895 // Get 'less good' of the two Scorers that we will examine.
896 final String worse = it.next();
897
898 // Never try and scavenge/remove the best (last) Scorer in this sub-population...
899 if(!it.hasNext()) { break; }
900 // Never try and remove the base/parameterless Scorer from this sub-population.
901 if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
902 // Skip any Scorer already waiting to be purged...
903 if(evictees.contains(worse)) { continue; }
904
905 // Get the 'better' of the two Scorers compared this time.
906 assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
907 final String better = it.next();
908 // Skip any Scorer already waiting to be purged...
909 if(evictees.contains(better)) { continue; }
910
911 // Check for genotypic similarity...
912 // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
913 if(!AbstractScorer.similarNParams(getScorerInstance(worse), getScorerInstance(better), similarity)) { continue; }
914
915 // Mark this Scorer for removal (and push into the back-stop cache immediately).
916 evictees.add(worse);
917 _backstopLRU.put(worse, _scorerScores.get(worse));
918 clearedOnLastPass = true;
919 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged near ("+similarity+") 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
920 }
921 }
922 }
923
924 // Now efficiently purge all the evictees (if any) en masse.
925 if(!evictees.isEmpty())
926 {
927 //if(IsDebug.isDebug) { System.out.println(" [ScorerPopulation: scavenged "+evictees.size()+'/'+_scorerScores.size()+" Scorers at "+((System.currentTimeMillis()-startTime)/evictees.size())+"ms/Scorer.]"); }
928 // Must remove from sub-population before removing from _scorerScores.
929 subPop.removeAll(evictees);
930 _scorerScores.keySet().removeAll(evictees);
931 }
932 }
933
934 /**Clear any caches which depend on the population state remaining unchanged.
935 * This clearing is done within the lock that protects the internal data structures
936 * (even though the caches can be safely accessed outside the lock)
937 * to minimise unwanted races (and eliminate them for lock-protected cache accesses).
938 */
939 private void _clearCaches()
940 {
941 synchronized(_lock)
942 {
943 // Clear any caches which may be otherwise invalid after an update.
944 _gCSWP_cache = null; // Will need to recompute the "best" Scorer set.
945 _gBS_cache.clear(); // Will need to recompute any breeding set.
946 }
947 }
948
949 /**If true, allow posting the global event for exchange of the best/new Scorers.
950 * The mechanism must also be enabled in the GenProps for this to happen.
951 */
952 private static final boolean USE_GLOBAL_EVENTS_ALSO = true;
953
954 /**Get current population size; non-negative.
955 * This value may slightly exceed the requested maximum.
956 * <p>
957 * This does not include the count of any entries in the back-stop LRU (negative) cache.
958 *
959 * @return count of unique Scorers that might be selected as part of a "breeding set"
960 */
961 public int size() { return(_scorerScores.size()); }
962
963 /**Return the target maximum population size; non-negative.
964 * The actual population size may exceed this somewhat; this is a target.
965 */
966 public int maxSize() { return(maxPopulationSize); }
967
968 /**Cache of result from getCurrentScorersWithParameters(); may be null if nothing cached.
969 * Initially null, and cleared on any update (put) to the population.
970 * <p>
971 * Any value cached here must be immutable for safety and sanity.
972 * <p>
973 * Marked volatile for thread-safe lock-free access.
974 */
975 private volatile Set<String> _gCSWP_cache;
976
977 /**Current immutable set of available Scorer names and parameters (where applicable); never null but may be empty.
978 * The values returned are of the form <i>ScorerName{:name=value}*</i>.
979 * <p>
980 * The Scorers returned by this will generally be the best available,
981 * suitable for making a real prediction of and exhibit score,
982 * and will usually be the best one or handful per base-Scorer type,
983 * allowing that different Scorer types may have different "skill" domains,
984 * and excluding any with non-positive goodness.
985 * <p>
986 * This routine may be expensive, though will try to be efficient.
987 */
988 public Set<String> getCurrentScorersWithParameters(final boolean allowStale)
989 {
990 // Check the cache first.
991 // The cache is cleared by any put/update of the population.
992 final Set<String> cachedValue = _gCSWP_cache;
993 if(_gCSWP_cache != null) { return(cachedValue); }
994
995 // Result set, of only +ve goodness Scorers.
996 final Set<String> r = new HashSet<String>(4 * baseScorers.size());
997
998 // Hold a lock on the data structures while working.
999 synchronized(_lock)
1000 {
1001 // Attempt to a candidate from each base type (sub population).
1002 for(final NavigableSet<String> subPop : _scorerByGoodness.values())
1003 {
1004 if(!subPop.isEmpty())
1005 {
1006 // Have the number of samples from each base name/type to include >= 1
1007 // (zero iff there are no eligible candidates at all)
1008 // and growing only (very) slowly with the population size.
1009 // In any case we try to return a diverse group from each sub-population.
1010 // We use log(log(n)) to allow for the possibility of multiple distinct useful candidates
1011 // in very large populations, without wasting too much time on many near clones.
1012 final int maxToInclude = Math.max(1, GenUtils.log2Approx(Math.max(1, GenUtils.log2Approx(subPop.size()))));
1013
1014 // Get the best Scorer of this type and its "goodness".
1015 final String best = subPop.last();
1016 final int bGoodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(best).second);
1017 if(bGoodness <= 0) { continue; /* All bad! */ }
1018 // We'll accept other Scorers nearly as good as the best,
1019 // but NOT of identical goodness or otherwise too similar (pheno/genotypically)
1020 // as we want (useful) variety in what we return, eg for multimodal matching.
1021 final int threshold = bGoodness/2;
1022 // Set of goodnesses so far: we don't return two identical.
1023 final Set<Integer> goodnesses = new HashSet<Integer>(2 * maxToInclude);
1024 // Set of scores so far: we don't return two identical.
1025 final Set<Short> scores = new HashSet<Short>(2 * maxToInclude);
1026 // Set of Scorer instances so far; we don't return two 'very similar'.
1027 final Collection<ScorerIF> instances = new ArrayList<ScorerIF>(maxToInclude);
1028 // Work backwards from the best (which is always included)...
1029 int remaining = maxToInclude;
1030 main: for(final String snp : subPop.descendingSet())
1031 {
1032 final ScoreAndConf sac = _scorerScores.get(snp).second;
1033 final Short score = sac.score;
1034 if(scores.contains(score)) { continue; /* Identical to a previous score. */ }
1035 scores.add(sac.score);
1036 final int g = ScoreAndConf.computeScorerGoodness(sac);
1037 // Stop if the goodness falls too far (or to/below zero).
1038 if(g <= threshold) { break; }
1039 assert(g > 0); // Only keeping Scorers with merit.
1040 final Integer G = g;
1041 if(goodnesses.contains(G)) { continue; /* Identical to a previous goodness. */ }
1042 goodnesses.add(G);
1043 // Make sure that this Scorer is not of very similar genotype to a previous one.
1044 final ScorerIF sc = getScorerInstance(snp);
1045 assert(sc != null);
1046 // We require increasingly-different Scorer instances as we gather more Scorers.
1047 final int maxSimilarity = Math.max(0, sc.getParameterDefsAndValues().size() - instances.size());
1048 for(final ScorerIF prevSc : instances)
1049 { if(AbstractScorer.similarNParams(prevSc, sc, maxSimilarity)) { continue main; /* Similar genotype to extant returnees. */ } }
1050 // Record new genetically-different Scorer.
1051 instances.add(sc);
1052 // This Scorer is good enough to return.
1053 r.add(snp);
1054 // Stop if we've collected enough.
1055 if(--remaining <= 0) { break; }
1056 }
1057 assert(r.contains(best));
1058 }
1059 }
1060 }
1061
1062 // We avoid cacheing empty results to assist with fast bootstrapping.
1063 if(r.isEmpty()) { return(Collections.emptySet()); }
1064 // Wrap the result to make it immutable, and cache it (unless empty).
1065 final Set<String> result = Collections.unmodifiableSet(r);
1066 _gCSWP_cache = result;
1067
1068 return(result);
1069 }
1070
1071 /**Cache of results for getBreedingSet(); never null.
1072 * Cleared by any put/update operation on the population.
1073 * <p>
1074 * Thread-safe map from (valid) baseName to breeding set.
1075 */
1076 private final ConcurrentMap<String, List<String>> _gBS_cache = new ConcurrentHashMap<String, List<String>>();
1077
1078 /**Get immutable random-access list of Scorers to "breed" from, best first; never null, possibly empty.
1079 * This returns all of (or some of the best of) the Scorers with the given base name.
1080 * <p>
1081 * This generally trims away all Scorers with non-positive values,
1082 * but will always try to return the few best items regardless,
1083 * ie the result is not completely empty unless the specified sub-population is.
1084 * <p>
1085 * The result will generally be large if the population is.
1086 */
1087 public List<String> getBreedingSet(final String baseName)
1088 {
1089 // Validate the base name first.
1090 if(baseName == null) { throw new IllegalArgumentException(); }
1091 if(!baseScorers.keySet().contains(baseName)) { throw new IllegalArgumentException(); }
1092
1093 final List<String> cachedValue = _gBS_cache.get(baseName);
1094 if(cachedValue != null) { return(cachedValue); }
1095
1096 // Hold lock while accessing mutable data.
1097 synchronized(_lock)
1098 {
1099 // Get in "worst-first" order.
1100 final ArrayList<String> r = new ArrayList<String>(_scorerByGoodness.get(baseName));
1101 // Convert to "best-first" order.
1102 Collections.reverse(r);
1103 final int size = r.size();
1104 final int limit = size/4;
1105 // Eliminate all those with non-positive goodness if possible,
1106 // but don't remove the best ~25% however poor they may appear to be
1107 // else we will not have any material to breed (and try to improve) from.
1108 for(int i = size; --i > limit; )
1109 {
1110 final int goodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(r.get(i)).second);
1111 if(goodness > 0) { break; }
1112 // Remove this trailing zero-goodness item.
1113 r.remove(i);
1114 }
1115 // Result must not be empty unless the sub-population is.
1116 assert(_scorerByGoodness.get(baseName).isEmpty() || !r.isEmpty());
1117
1118 // Wrap the result for immutablity and cache it.
1119 final List<String> result = Collections.unmodifiableList(r);
1120 _gBS_cache.putIfAbsent(baseName, result); // Reduce chance of update races.
1121
1122 return(result);
1123 }
1124 }
1125
1126 /**Interface of call-back hook to report a new "best" Scorer by name, eg to the system event record.
1127 */
1128 public interface NewBestCallbackIF
1129 {
1130 /**Used to post a new "best of breed" Scorer name and parameters for posterity.
1131 * This routine should not throw any exceptions.
1132 * @param scorerNameAndParameters valid name-and-parameters set; never null.
1133 */
1134 public void reportNewBestScorer(final String scorerNameAndParameters);
1135 }
1136
1137
1138 /**Small thread-safe LRU cache for getScorerInstance(); never null.
1139 * Intended to avoid repeated re-parsing of args and recreation of instances, etc,
1140 * for very-commonly-used Scorers used to evaluate every exhibit.
1141 * Usually we'd expect this to be a handful actually needed regularly,
1142 * but while hunting for new solutions (or scavenging for space)
1143 * we'll expect to churn this cache quite hard.
1144 */
1145 private final SimpleLRUMapAutoSizeForHitRate<String, ScorerIF> _gSI_cache;
1146
1147 /**Get Scorer instance given the Scorer{:value=name}* format; null if no such Scorer available.
1148 * Any instance returned may be a shared/cached instance rather than a new instance.
1149 * <p>
1150 * This does not attempt to canonicalise the names it is given,
1151 * so may create and cache duplicates for example.
1152 * <p>
1153 * This can be used to create any valid Scorer instance,
1154 * not just entries in the given population.
1155 */
1156 public ScorerIF getScorerInstance(final String nameAndParameters)
1157 {
1158 if(nameAndParameters == null) { throw new IllegalArgumentException(); }
1159
1160 // Try lookup in the cache; return a cached result without further ado!
1161 final ScorerIF cached = _gSI_cache.get(nameAndParameters);
1162 if(cached != null) { return(cached); }
1163
1164 // Quick extraction of base name.
1165 final int fs = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
1166 final boolean hasParams = (fs != -1);
1167 final String baseName = hasParams ? nameAndParameters.substring(0, fs) : nameAndParameters;
1168
1169 // Get a base instance of the Scorer, if any.
1170 final ScorerIF sc = baseScorers.get(baseName);
1171 // If this base name is not known/supported, stop now.
1172 if(sc == null) { return(null); }
1173
1174 // Fast-track the simple no-parameters case.
1175 if(!hasParams) { return(sc); }
1176
1177 try
1178 {
1179 // Use base non-parameterised instance to create variant.
1180 final ScorerIF newInstance = sc.createVariant(nameAndParameters);
1181 assert(newInstance != null);
1182 // Cache and return this value.
1183 _gSI_cache.put(nameAndParameters, newInstance);
1184 return(newInstance);
1185 }
1186 // Handle a simple argument-format error by propagating up to caller as usual.
1187 catch(final IllegalArgumentException e) { throw e; }
1188 // Log and propagate any other error.
1189 catch(final Exception e) { e.printStackTrace(); throw new Error(e); }
1190 }
1191
1192 }