001    /*
002    Copyright (c) 1996-2009, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.ai.scorer;
031    
032    import java.io.IOException;
033    import java.util.ArrayList;
034    import java.util.Collection;
035    import java.util.Collections;
036    import java.util.Comparator;
037    import java.util.HashMap;
038    import java.util.HashSet;
039    import java.util.Iterator;
040    import java.util.List;
041    import java.util.Map;
042    import java.util.NavigableSet;
043    import java.util.Set;
044    import java.util.SortedSet;
045    import java.util.TreeSet;
046    import java.util.concurrent.ConcurrentHashMap;
047    import java.util.concurrent.ConcurrentMap;
048    import java.util.concurrent.atomic.AtomicInteger;
049    
050    import org.hd.d.pg2k.svrCore.GenUtils;
051    import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
052    import org.hd.d.pg2k.svrCore.MemoryTools.SimpleLRUMapAutoSizeForHitRate;
053    import org.hd.d.pg2k.svrCore.Tuple.Pair;
054    import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
055    import org.hd.d.pg2k.svrCore.vars.EventPeriod;
056    import org.hd.d.pg2k.svrCore.vars.SimpleVariableValue;
057    import org.hd.d.pg2k.svrCore.vars.SystemVariables;
058    
059    import ORG.hd.d.IsDebug;
060    
061    
062    /**Thread-safe container for Scorer-and-parameter information to maintain an evolving population.
063     * This separates Scorers by basic (parameterless) name,
064     * on the grounds that they are not directly comparable.
065     * <p>
066     * This has limited capacity (so as to cap resource use),
067     * and discards the least good parameterised Scorers as needed
068     * to make way for new (better) instances.
069     * (These discarded less-good Scorers may have their results retained
070     * in a smalled secondary "negative" cache to speed up re-testing.)
071     * <p>
072     * This has the automatially persists/announces/shares new/good instances
073     * using the system variables events mechanism, local and global.
074     * <p>
075     * TODO: in case of accute memory shortage abandon least-good Scorers
076     */
077    public final class ScorerPopulation
078        {
079        /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
080         * Note that we may slightly exceed the maximum specified population size for expediency,
081         * and we may dump some (less good) part of the population under memory stress.
082         *
083         * @param baseScorers  non-null, non-empty, pure-base-name seed set
084         * @param maxPopulationSize  maximum number of Scorer details to retain in memory,
085         *     possibly excluding the fixed/base Scorers, strictly positive;
086         *     this value is advisory and may be slightly exceeded for performance
087         * @param callback  used to report new best-of-breed being inserted into cache;
088         *     may be null
089         */
090        public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
091                                final NewBestCallbackIF callback,
092                                final int maxPopulationSize)
093            {
094            this.callback = callback;
095    
096            if(baseScorers == null) { throw new IllegalArgumentException(); }
097            // Store immutable (defensive) copy of base Scorer names/instances.
098            this.baseScorers = Collections.unmodifiableMap(new HashMap<String, ScorerIF>(baseScorers));
099            if(this.baseScorers.isEmpty()) { throw new IllegalArgumentException(); }
100            // Validate the Scorer names and instances.
101            for(final String n : this.baseScorers.keySet())
102                {
103                // Validate the name, and make sure that there are no parameters.
104                if(!AbstractScorer.parseNameAndParameters(n).second.isEmpty())
105                    { throw new IllegalArgumentException("no parameters allowed on seed set"); }
106                // Ensure that the name corresponds to the actual Scorer instance supplied.
107                final ScorerIF scorer = baseScorers.get(n);
108                if((scorer == null) || !n.equals(scorer.getBaseName()))
109                    { throw new IllegalArgumentException("mismatch between Scorer base name and instance"); }
110                if(!n.equals(scorer.getNameAndParameters()))
111                    { throw new IllegalArgumentException("supplied Scorer instance was parameterised"); }
112                }
113            // Create the Set of valid classes for quick validation of instances.
114            supportedScorerClasses = new HashSet<Class<? extends ScorerIF>>(2 * this.baseScorers.size());
115            for(final ScorerIF sc : this.baseScorers.values())
116                { supportedScorerClasses.add(sc.getClass()); }
117    
118            // Must be at least two slots per base Scorer type/name
119            // (to allow for the base/parameterless value and one 'optimal' value).
120            if(maxPopulationSize <= 2*this.baseScorers.size()) { throw new IllegalArgumentException("population too small"); }
121            this.maxPopulationSize = maxPopulationSize;
122    
123            // Size to accommodate the specified capacity without further expansion.
124            // We don't expect much write concurrency.
125            // We use a lowish load-factor to enhance read and write performance.
126            _scorerScores = new ConcurrentHashMap<String, Pair<Long, ScoreAndConf>>(2*maxPopulationSize, 0.5f, 4);
127    
128            // The comparator arranges for the items to be sorted by increasing "goodness".
129            // Because of a lack of symmetry in scoring
130            // (ie reversing the prediction of a bad Scorer does not necessarily make for a good prediction)
131            // we are most interested in the highest-confidence, positive-correlation Scorers.
132            SCORER_RANK_COMPARATOR = new Comparator<String>()
133                {
134                // This treats missing items from _scorerScores as neutral (goodness 0).
135                public final int compare(final String a, final String b)
136                    {
137                    final ScoreAndConf sacA = _scorerScores.get(a).second;
138                    final ScoreAndConf sacB = _scorerScores.get(b).second;
139                    // Primary sort by goodness.
140                    final int goodnessA = (sacA == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacA);
141                    final int goodnessB = (sacB == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacB);
142                    if(goodnessA < goodnessB) { return(-1); }
143                    if(goodnessA > goodnessB) { return(+1); }
144                    if((sacA != null) && (sacB != null))
145                        {
146                        // Secondary sort by score (if both scores available).
147                        if(sacA.score < sacB.score) { return(-1); }
148                        if(sacA.score > sacB.score) { return(+1); }
149                        }
150                    // Break ties by name to ensure a total ordering.
151                    return(a.compareTo(b));
152                    }
153                };
154            // Create the (immutable) outer map with empty sub-maps for _scorerByGoodness.
155            final Map<String, NavigableSet<String>> sbg = new HashMap<String, NavigableSet<String>>(2 * baseScorers.size());
156            for(final String n : this.baseScorers.keySet())
157                { sbg.put(n, new TreeSet<String>(SCORER_RANK_COMPARATOR)); }
158            _scorerByGoodness = Collections.unmodifiableMap(sbg);
159    
160            // Backup LRU cache, mainly for "bad" Scorer results.
161            // This also accommodates 'clones' evicted from the main population.
162            final int _backstopLRUMaxSize = 1 + maxPopulationSize/2;
163            _backstopLRU = SimpleLRUMapAutoSizeForHitRate.<String, Pair<Long, ScoreAndConf>>create(1+(_backstopLRUMaxSize/8), _backstopLRUMaxSize, "_backstopLRU");
164    
165            // Create Scorer lookup cache, size limited, in proportion to max population size.
166            // This allows us to hold in cache every live Scorer in the main population.
167            // This assumes that each Scorer instance does not consume much memory/resource.
168            // Since we may well need to churn through the entire population sequentially,
169            // eg during scavenging or possibly while selecting a breeding set,
170            // this cache should be significantly larger than the population size to be of most value.
171            // TODO: should possibly be held via SoftReference to be dropped during extreme memory stress.
172            _gSI_cache = SimpleLRUMapAutoSizeForHitRate.<String, ScorerIF>create(1+(maxPopulationSize/8), 1 + ((3*maxPopulationSize)/2), "_gSI_cache");
173            }
174    
175        /**Comparator used to rank Scorers relative to one another by increasing goodness, never null.
176         * Scorer values not currently represented in the population
177         * are treated as if goodness 0 (ie neutral).
178         * <p>
179         * If both goodnesses are available a secondary ranking is done
180         * by increasing score.
181         * <p>
182         * Complete ties are broken by Scorer name.
183         */
184        public final Comparator<String> SCORER_RANK_COMPARATOR;
185    
186        /**If true then we may kick out apparent-near duplicates as the population reaches maximum.
187         * The aim of doing this is to increase diversity of the 'gene pool' for a given capacity,
188         * by making space for significantly different genotypes
189         * and avoiding them being swamped or pushed out by many near-clones of the overal best Scorer
190         * in any given sub-population.
191         * <p>
192         * This scavenging may have a random element.
193         * <p>
194         * Scavenging may be quite slow.
195         */
196        private static final boolean SCAVENGE_FOR_DIVERSITY = true;
197    
198        /**Reset/clear the population to its initial (empty) state.
199         * Does not shut out all other activity,
200         * so it is possible that new items may be inserted while this is running.
201         */
202        public void clear()
203            {
204            // First clear the back-stop cache.
205            _backstopLRU.clear();
206            // Now systematically clear the main data stores and caches under the data lock.
207            synchronized(_lock)
208                {
209                _clearCaches();
210                _scorerScores.clear();
211                for(final NavigableSet<String> nm : _scorerByGoodness.values())
212                    { nm.clear(); }
213                }
214            // Finally clear out the Scorer instance cache.
215            _gSI_cache.clear();
216            }
217    
218        /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
219         * Note that we may slightly exceed the maximum specified population size for expediency,
220         * and we may dump some (less good) part of the population under memory stress.
221         * <p>
222         * This automatically posts new best-of-breed Scorers to the system variables event store
223         * as they are cached.
224         *
225         * @param baseScorers  non-null, non-empty, pure-base-name seed set
226         * @param maxPopulationSize  maximum number of Scorer details to retain in memory,
227         *     possibly excluding the fixed/base Scorers, strictly positive;
228         *     this value is advisory and may be slightly exceeded for performance
229         * @param vars  system variables pipeline to record new/best Scorer events to
230         */
231        public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
232                                final int maxPopulationSize,
233                                final SimpleExhibitPipelineIF vars)
234            {
235            this(baseScorers,
236                 (new NewBestCallbackIF() {
237                    public final void reportNewBestScorer(final String nameAndParameters)
238                        {
239                        final SimpleVariableValue svvL = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_LOCAL_EVENT,
240                                                                                nameAndParameters);
241                        try { vars.setVariable(svvL); }
242                        catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
243                        try
244                            {
245                            if(USE_GLOBAL_EVENTS_ALSO &&
246                                    "true".equalsIgnoreCase(vars.getGenProps(-1).getGen().get(GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX)))
247                                {
248                                final SimpleVariableValue svvG = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_GLOBAL_EVENT,
249                                        nameAndParameters);
250                                try { vars.setVariable(svvG); }
251                                catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
252                                }
253                            }
254                        catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
255                        }
256                    }),
257                 maxPopulationSize);
258            }
259    
260        /**Maximum number of Scorer details to retain, possibly excluding the fixed/base Scorers, strictly positive.
261         * Must in fact be large than the number of base Scorer types,
262         * since we will need at least once slot for each non-parameterised type.
263         */
264        private final int maxPopulationSize;
265    
266        /**Immutable Map from base Scorer names (no parameters) to base/parameterless instances to monitor/maintain; non-null, non-empty. */
267        private final Map<String, ScorerIF> baseScorers;
268    
269        /**Set of class values for the supported Scorer types for fast validation; never null. */
270        private final Set<Class<? extends ScorerIF>> supportedScorerClasses;
271    
272        /**Callback to report new best-of-breed being inserted into cache; may be null. */
273        private final NewBestCallbackIF callback;
274    
275        /**A back-stop LRU cache of Scorer goodness, so that in particular the results of poor Scorers need not be excessively recomputed; never null.
276         * Normally we only keep the ScoreAndConf of the best Scorers,
277         * but if we happen to be repeatedly asked for the results of poor Scorers
278         * and we don't hold those results because they are not good to be in the main population
279         * then we may simply waste disproportionate time recomputing them.
280         * <p>
281         * This aims to be a bonus back-stop cache of all results not in the main cache,
282         * and is not strictly part of the counted population
283         * because these are not part of the potential "breeding set":
284         * they are here only for "negative cacheing" of poor Scorer parameters sets.
285         * <p>
286         * We keep it much smaller than the main population
287         * so that it is not a significant resource drain.
288         * <p>
289         * We push Scorers into it when they are displaced from the main population.
290         * <p>
291         * Thread-safe.
292         * <p>
293         * It is permissible and viable to hold a lock on this object over multiple operations
294         * to make them into one compound atomic operation.
295         */
296        private final SimpleLRUMapAutoSizeForHitRate<String, Pair<Long, ScoreAndConf>> _backstopLRU;
297    
298        /**Non-user-visible  lock to serialise activity on _scorerScores and _scorerByGoodness; never null.
299         * No blocking/external calls are made in the scope of this lock
300         * so as to ensure that it is held only for very short intervals.
301         */
302        private final Object _lock = new Object();
303    
304        /**Thread-safe cache of Scorer scores (and time when cached), indirectly size-limited; never null.
305         * Map from scorer name-and-parameters to ScoreAndConfidence and the time computed/cached.
306         * <p>
307         * Single get() accesses to this can be made safely without further locking
308         * so as to maximise concurrency.
309         * <p>
310         * Note that since a put() operation does not lock out get() operations,
311         * any updates on the main _scorerScores map must be incrementally valid,
312         * ie incorrect data must never be inserted into the _scorerScores map.
313         */
314        private final ConcurrentMap<String, Pair<Long, ScoreAndConf>> _scorerScores;
315    
316        /**Map from base Scorer name to a least-good-first NavigableSet of full nameAndParameters; never null.
317         * We pre-populate this with empty maps under each base name that we handle
318         * (so it is never necessary to test for presence in, or ever alter, this outer map; hence it is immutable).
319         * <p>
320         * The top-level Comparator is selected to sort by goodness derived from the _scorerScores,
321         * so any extant entry in the nameAndParameters should be removed from the _scorerByGoodness map
322         * before removal from or replacement in the _scorerScores map,
323         * and re-inserted afterwards as necessary.
324         * <p>
325         * Since the different base Scorer names/types are held separately,
326         * this helps keep the probably-distinct populations from competing with one another,
327         * though the overall population size cap is still present.
328         * <p>
329         * Because of a potential lack of symmetry in scoring
330         * (ie reversing the prediction of a bad Scorer does not necessarily make it a good one)
331         * we are interested in preserving the highest-confidence, positive-correlation Scorers.
332         * <p>
333         * Not thread-safe; access only under _lock.
334         */
335        private final Map<String, NavigableSet<String>> _scorerByGoodness;
336    
337        /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
338         * Never recomputes, always fetches from its cache if present, else returns null.
339         * <p>
340         * Older results (typically up to 1 day older) may be returned if allowStale is true;
341         * this may help the system overall ride out temporary I/O problems and disconnections.
342         * <p>
343         * These get operations are generally fast,
344         * though some cycle-stealing incremental housekeeping might be performed on this call.
345         *
346         * @param scorer  the Scorer instance; never null
347         * @param allowStale  if true then allow an older cache response
348         *     (up to VLONG period older)
349         *
350         * @return  the score represents the correlation with the underlying votes
351         *     (and whatever the scoring is measured against)
352         *     with MAX meaning perfect correlation, 0 meaning no correlation,
353         *     and -MAX meaning perfectly wrong answers all the time,
354         *     and the confidence 0 if we have no (or very few) data points yet
355         *     and approaching MAX as we have a large (enough) number of data points
356         *
357         * @throws IllegalArgumentException  if the Scorer is null
358         */
359        public ScoreAndConf getScorerWeighting(final ScorerIF scorer,
360                                               final boolean allowStale)
361            {
362            if(scorer == null) { throw new IllegalArgumentException(); }
363            return(getScorerWeighting(scorer.getNameAndParameters(), allowStale));
364            }
365    
366        /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
367         * Never recomputes, always fetches from its cache if present, else returns null.
368         * <p>
369         * Older results (typically up to 1 day older) may be returned if allowStale is true;
370         * this may help the system overall ride out temporary I/O problems and disconnections.
371         * <p>
372         * These get operations are generally fast,
373         * though some cycle-stealing incremental housekeeping might be performed on this call.
374         *
375         * @param scorerNameAndParameters  the name and parameters of the Scorer; never null
376         * @param allowStale  if true then allow an older cache response
377         *     (up to VLONG period older)
378         *
379         * @return  the score represents the correlation with the underlying votes
380         *     (and whatever the scoring is measured against)
381         *     with MAX meaning perfect correlation, 0 meaning no correlation,
382         *     and -MAX meaning perfectly wrong answers all the time,
383         *     and the confidence 0 if we have no (or very few) data points yet
384         *     and approaching MAX as we have a large (enough) number of data points
385         */
386        public ScoreAndConf getScorerWeighting(final String scorerNameAndParameters,
387                                               final boolean allowStale)
388            {
389            if(scorerNameAndParameters == null) { throw new IllegalArgumentException(); }
390    
391            final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(System.currentTimeMillis());
392    
393            // We don't need to validate the requested name (other than as being non-null)
394            // since an invalid value will correctly return null.
395            // We don't need to take any locks for this single get();
396            // this helps maximise concurrency and can proceed
397            // even when a long/slow put() operation is taking place in another thread.
398            final Pair<Long, ScoreAndConf> cachedValue = _scorerScores.get(scorerNameAndParameters);
399            // We can use a value from cache if the cached value is from the current period
400            //     (or the previous period if allowStale is true).
401            if((cachedValue != null) &&
402               (EventPeriod.VLONG.getIntervalNumber(cachedValue.first) >= (allowStale ? currentPeriod-1 : currentPeriod)))
403                { return(cachedValue.second); }
404    
405            // Try back-stop LRU cache.
406            // We access this out of the scope of the main lock to maximise concurrency.
407            // Note that we allow use of older values than from the main population.
408            // We actively remove expired items to make more room in this back-stop cache.
409            synchronized(_backstopLRU)
410                {
411                final Pair<Long, ScoreAndConf> backstopResult = _backstopLRU.get(scorerNameAndParameters);
412                if(backstopResult != null)
413                    {
414                    final long oldestRetained = currentPeriod-2;
415                    final long cachedItemAge = EventPeriod.VLONG.getIntervalNumber(backstopResult.first);
416                    if(cachedItemAge < oldestRetained)
417                        {
418                        // Remove expired value to make room for others.
419                        _backstopLRU.remove(scorerNameAndParameters);
420                        }
421                    else if(cachedItemAge >= (allowStale ? oldestRetained : currentPeriod))
422                        {
423    //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation.getScorerWeighting(): caught lookup with back-stop LRU cache.]"); }
424                        return(backstopResult.second);
425                        }
426                    }
427                }
428    
429            // Nothing in either cache...
430            return(null);
431            }
432    
433        /**GenProps 'generic' parameter that must be set for us to post to the global event store. */
434        public static final String GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX = "ai.scorerGlobalPostEnable";
435    
436        /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
437         * Saves the given Scorer and "goodness",
438         * if necessary evicting a poorly-performing Scorer to make room
439         * ie to limit the population size.
440         * <p>
441         * If the "put" Scorer is very good compared to the existing population members
442         * then it may get persisted/shared via the event mechanism.
443         * <p>
444         * This routine will only consider storing entries for Scorers whose base name
445         * is on the list we were constructed with.
446         * <p>
447         * Note that a put() operation does not lock out get() operations,
448         * so any updates on the main _scorerScores map must be incrementally valid,
449         * ie incorrect data must never be inserted into the _scorerScores map.
450         * <p>
451         * This will attempt to cache the actual Scorer instance provided
452         * which may save time reconstructing it later, and memory by better sharing.
453         * This instance may be recovered with getScorerInstance() later,
454         * though may have to be recreated (ie returning the same instance is never guaranteed).
455         *
456         * @param scorer  the Scorer instance; never null
457         * @param sac  rating of this Scorer across all exhibits; never null
458         * @param dataSource  to record/persist/share good Scorers through;
459         *     null if no Scorer persistence is required
460         * @param log logger; never null
461         *
462         * @return  true if the Scorer result is new for its sub-population and best in it
463         *     and has a positive overall "goodness" (ie has some positive predictive ability)
464         *
465         * @throws IllegalArgumentException  if the Scorer is null or of an unsupported type
466         */
467        public boolean putScorerWeighting(final ScorerIF scorer,
468                                          final ScoreAndConf sac,
469                                          final SimpleExhibitPipelineIF dataSource,
470                                          final SimpleLoggerIF log)
471            {
472            if(scorer == null) { throw new IllegalArgumentException(); }
473            if(!supportedScorerClasses.contains(scorer.getClass())) { throw new IllegalArgumentException("unsupported Scorer class"); }
474            final String nameAndParameters = scorer.getNameAndParameters();
475            _gSI_cache.put(nameAndParameters, scorer);
476            return(putScorerWeighting(nameAndParameters, sac, dataSource, log));
477            }
478    
479        /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
480         * Saves the given Scorer and "goodness",
481         * if necessary evicting a poorly-performing Scorer to make room
482         * ie to limit the population size.
483         * <p>
484         * If the "put" Scorer is very good compared to the existing population members
485         * then it may get persisted/shared via the event mechanism.
486         * <p>
487         * This routine will only consider storing entries for Scorers whose base name
488         * is on the list we were constructed with.
489         * <p>
490         * Note that a put() operation does not lock out get() operations,
491         * so any updates on the main _scorerScores map must be incrementally valid,
492         * ie incorrect data must never be inserted into the _scorerScores map.
493         *
494         * @param nameAndParameters  the name and parameters of the scorer; never null
495         * @param sac  rating of this Scorer across all exhibits; never null
496         * @param dataSource  to record/persist/share good Scorers through;
497         *     null if no Scorer persistence is required
498         * @param log logger; never null
499         *
500         * @return  true if the Scorer result is new for its sub-population and best in it
501         *     and has a positive overall "goodness" (ie has some positive predictive ability)
502         */
503        public boolean putScorerWeighting(final String nameAndParameters,
504                                          final ScoreAndConf sac,
505                                          final SimpleExhibitPipelineIF dataSource,
506                                          final SimpleLoggerIF log)
507            {
508            if(sac == null) { throw new IllegalArgumentException(); }
509            if(nameAndParameters == null) { throw new IllegalArgumentException(); }
510    
511            final Pair<String, Map<String, String>> nap = AbstractScorer.parseNameAndParameters(nameAndParameters);
512            final String baseName = nap.first;
513            // Ignore if not one of our recognised base Scorers.
514            if(!baseScorers.keySet().contains(baseName)) { return(false); }
515            final int goodness = ScoreAndConf.computeScorerGoodness(sac);
516            final boolean parameterless = nap.second.isEmpty();
517    
518            final long startTime = System.currentTimeMillis();
519    
520            // Set true if we discover that this is the best Scorer of its type so far
521            // so we should save/share it.
522            boolean isNewAndBest = false;
523    
524            // Still a candidate for storing, so grab a lock...
525            synchronized(_lock)
526                {
527                // Clear caches within the lock scope to avoid/minimise races.
528                _clearCaches();
529    
530                final SortedSet<String> insertedScorerSubPopulation = _scorerByGoodness.get(baseName);
531                // Compute previous 'best' score in this sub-population if any.
532                final int prevBest = insertedScorerSubPopulation.isEmpty() ? Integer.MIN_VALUE :
533                    ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second);
534    
535                // Remove and return any extant value for this Scorer value, null if none previously.
536                final Pair<Long, ScoreAndConf> oldVal = _remove(nameAndParameters);
537    
538                // Insert the value: first in _scorerScores, then in the dependent _scorerByGoodness.
539                // We DO NOT intern() the possibly-long name because doing so would stress the PermGen.
540                _scorerScores.put(nameAndParameters, new Pair<Long, ScoreAndConf>(startTime, sac));
541                insertedScorerSubPopulation.add(nameAndParameters);
542    
543                // Test if this is the best Scorer in its sub-population
544                // and has a non-zero goodness.
545                // Note that this has already been inserted into that sub-population,
546                // so cannot be "better",
547                // so we test against the previous 'best' value if any.
548                assert(goodness <= ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second));
549                if((goodness > 0) && (goodness > prevBest) && (oldVal == null))
550                    { isNewAndBest = true; }
551    
552                // If the population is oversize following this insertion
553                // then we'll cull the worst Scorer from the largest sub-population if possible.
554                if(_scorerScores.size() > maxPopulationSize)
555                    {
556                    // Find the largest sub-population.
557                    int largestSubPopulation = -1;
558                    String largestSubPopulationBasename = null;
559                    for(final String subPopBaseName : _scorerByGoodness.keySet())
560                        {
561                        final int subPopSize = _scorerByGoodness.get(subPopBaseName).size();
562                        if(subPopSize > largestSubPopulation)
563                            {
564                            largestSubPopulation = subPopSize;
565                            largestSubPopulationBasename = subPopBaseName;
566                            }
567                        }
568                    assert(largestSubPopulation > 0);
569                    assert(largestSubPopulationBasename != null);
570    
571                    // Don't attempt scavenging all the time, since it may be very expensive, ie O(n).
572                    // We try to reduce the average time scavenging to about O(1) on average.
573                    // Scavenging is likely to work more effectively per-call if called infrequently.
574                    if(SCAVENGE_FOR_DIVERSITY)
575                        {
576                        // Attempt one scavenging (clone-stripping) run on this (largest) sub-population.
577                        _scavenge(largestSubPopulationBasename);
578                        }
579    
580                    // If scavenging failed to recover any space then evict the least-good Scorer.
581                    if(_scorerScores.size() > maxPopulationSize)
582                        {
583                        // ZAP LEAST-GOOD SCORER SO AS TO CAP THE POPULATION SIZE IF (STILL) OVERSIZE...
584                        // Iterate through the largest population
585                        // culling the first (least good) parameterised Scorer(s) found, if any.
586                        // We may not be able to remove anything, leaving the population slightly oversized.
587                        final SortedSet<String> victimSubPop = _scorerByGoodness.get(largestSubPopulationBasename);
588                        for(final String putativeVictim : victimSubPop)
589                            {
590                            // We never remove the parameterless base Scorer from its sub-population.
591                            if(putativeVictim.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
592                            // We push this about-to-be-zapped victim into the back-stop LRU cache...
593                            _backstopLRU.put(putativeVictim, _scorerScores.get(putativeVictim));
594                            // Now remove the victim from the main data stores.
595                            final Pair<Long, ScoreAndConf> zapped = _remove(putativeVictim);
596                            assert(zapped != null); // Must have actually removed the specified 'poor' Scorer.
597                            // Having trimmed the population (and altered the collections) we must now stop.
598                            break;
599                            }
600                        }
601                    }
602                }
603    
604            // If this a new/best +ve-goodness Scorer (in its sub-population) then post it locally
605            // if not or parameterless, and if we have a live dataSource.
606            // This is done outside of the lock scope
607            // to avoid blocking other get/put operations during I/O.
608            // We absorb (and report) any I/O error, but continue.
609            if(isNewAndBest && !parameterless && (callback != null))
610                {
611    if(IsDebug.isDebug) { log.log("[ScorerPopulation: new best Scorer: "+nameAndParameters+" "+sac+", 'goodness'="+goodness+"...]"); }
612                callback.reportNewBestScorer(nameAndParameters);
613                }
614    
615            return(isNewAndBest);
616            }
617    
618        /**Remove and return any extant value for this Scorer value, null if none previously.
619         * Must only be called under the data lock to be atomic wrt the data stores.
620         *
621         * @param nameAndParameters  non-null valid Scorer value incluing optional parameters
622         *
623         * @return  previous timestamp/value for this Scorer, else null if nne
624         */
625        private synchronized Pair<Long, ScoreAndConf> _remove(final String nameAndParameters)
626            {
627            assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
628    
629            // See if we have an existing entry for this nameAndParameters.
630            // (If not, there is nothing to remove.)
631            final Pair<Long, ScoreAndConf> oldVal = _scorerScores.get(nameAndParameters);
632            if(oldVal == null) { return(null); }
633            // If we do, then remove the corresponding entry from _scorerByGoodness first
634            // else we will break its comparator/lookup which depends on _scorerScores.
635            // Quickly extract the base name of the Scorer to select the sub-population.
636            final int firstSep = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
637            final String baseName = (firstSep == -1) ? nameAndParameters : (nameAndParameters.substring(0, firstSep));
638            // Remove from the appropriate sub-population first...
639            final SortedSet<String> subPopulation = _scorerByGoodness.get(baseName);
640            assert (subPopulation != null);
641            assert (subPopulation.contains(nameAndParameters));
642            subPopulation.remove(nameAndParameters);
643            assert (!subPopulation.contains(nameAndParameters));
644            // Then remove from the main mapping, ie _scorerScores.
645            _scorerScores.remove(nameAndParameters);
646            return(oldVal);
647            }
648    
649        /**Countdown to next scavenge: when zero then scavenge and add on current population size. */
650        private final AtomicInteger countdownToNextScavenge = new AtomicInteger();
651    
652        /**Minimum 'delta' in score for two Scorers to be considered potentially similar; non-negative.
653         * This is the minimum difference in the raw score, not weighted by confidence,
654         * for two Scorers on similar overall goodness to be considered 'similar',
655         * given that we will then further filter by genotype and/or phenotype,
656         * the assumption being that differences of this scale may just be (rounding and other) noise.
657         * <p>
658         * A good value for this is probably less than 1% of maximum,
659         * even though many Scorers will struggle to be much better than chance than this.
660         * <p>
661         * Note that we may apply a tighter test to values very close to zero.
662         * <p>
663         * A value of zero indicates that scores have to be identical
664         * to be considered similar by this measure.
665         */
666        private static final int SIMILAR_SCORE_DELTA = Math.max(2, ScoreAndConf.MAX_GOODNESS >>> 7);
667    
668        /**If true then if a minor scavenge fails always force a major scavenge.
669         * This may be expensive but more-or-less guarantees
670         * that there will always be some space in the population
671         * for marginal but unusual Scorers to have a chance to be bred from.
672         */
673        private static final boolean FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS = true;
674    
675        /**Attempt to strip out near-clones from the population to make space for more diversity.
676         * This works on the assumption that there may be many valuable Scorer values
677         * that while not optimal across all exhibits or at this very moment,
678         * may be valuable on sub-sets of the exhibits now or in the near future.
679         * This routine attempts to prevent limited population space (gene-pool capacity)
680         * from being overrun with very slight variations on (effective clones of) a monoculture.
681         * <p>
682         * Currently this is done by elimination of genotypic near-clones
683         * by eliminating those Scorers that have very similar Scores and no sufficiently-distinct genes.
684         * <p>
685         * TODO: In future this may be done by elimination of phenotypic near clones
686         * by eliminating those Scorers that are have the same predction performance over a given calbration set.
687         * <p>
688         * This may not trim out anything from the specified sub-population.
689         * <p>
690         * This will never remove the 'best' item nor the base/parameterless Scorer.
691         * <p>
692         * Values removed by scavenging may be pushed into the back-stop LRU cache
693         * in case they are used again soon by the outside world;
694         * thus we may limit the amount of any scavengine we do in one go
695         * so as to avoid overflowing this back-stop cache.
696         * <p>
697         * This tries a series of more drastic pruning measures until it zaps some/enough Scorers,
698         * with the inital pruning passes being very quick and gentle.
699         * <p>
700         * This does not guarantee to remove any Scorers at all.
701         * <p>
702         * This must be called under the data lock.
703         * <p>
704         * Run time is expected to average about O(1) (peak O(n)) for a size-n sub-population.
705         *
706         * @param largestSubPopulationBasename
707         */
708        private void _scavenge(final String largestSubPopulationBasename)
709            {
710            assert((largestSubPopulationBasename != null) && (_scorerByGoodness.get(largestSubPopulationBasename) != null));
711            assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
712    
713            final long startTime = System.currentTimeMillis();
714            final long currentTick = EventPeriod.VLONG.getIntervalNumber(startTime);
715    
716            // Compute the maximum number of Scorers to attempt to evict in this run.
717            final NavigableSet<String> subPop = _scorerByGoodness.get(largestSubPopulationBasename);
718    
719            // Record the size of the sub-population when we start the scavenge.
720            final int startSubPopSize = subPop.size();
721    
722            // MINOR scavenge limits.
723            // Cap this to a very slow-growing fraction of the sub-population size
724            // so as to keep the mean run-time very close to O(1).
725            final int maxMinorPurgeCount = Math.min(2+GenUtils.log2Approx(startSubPopSize), startSubPopSize-2);
726            // Return immediately if not big enough for us to do even a decent minor scavenge from.
727            if(maxMinorPurgeCount < 2) { return; }
728    
729    //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging...  Sub-population size = "+(startSubPopSize)+".]"); }
730    
731            // The set of Scorers that we wish to evict...
732            final Set<String> evictees = new HashSet<String>(16 + 2*maxMinorPurgeCount);
733    
734            // Do MINOR purge...
735            // Remove anything from the extreme tail end (least goodness) part of the sub-population:
736            //   * Without merit (score < 0).
737            //   * Very similar to any item better than it in this tail.
738            //
739            // We copy this always-very-small tail into a new List, worst-first.
740            final List<String> tail = new ArrayList<String>(maxMinorPurgeCount);
741            final Iterator<String> tailIt = subPop.iterator();
742            for(int i = maxMinorPurgeCount; --i >= 0; ) { tail.add(tailIt.next()); }
743            assert(tail.size() == maxMinorPurgeCount);
744            assert(tail.size() < startSubPopSize); // Ensure that we can never zap the 'best' Scorer by mistake.
745            for(int i = 0; i < maxMinorPurgeCount; ++i)
746                {
747                final String n = tail.get(i);
748                // Never try and remove the base/parameterless Scorer from this sub-population.
749                if(n.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
750                final ScoreAndConf sac = _scorerScores.get(n).second;
751                // Zap anything without a positive score/confidence.
752                if((sac.score <= 0) || (sac.confidence == 0))
753                    {
754                    evictees.add(n);
755                    _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
756                    continue;
757                    }
758                // Zap anything very at all to any Scorer better than it in the tail.
759                final ScorerIF sc = getScorerInstance(n);
760                assert(n != null);
761                for(int j = i+1; j < maxMinorPurgeCount; ++j)
762                    {
763                    // We need to be at least a little different to everything else better than us
764                    // if we are to be retained.
765                    if(AbstractScorer.verySimilar(sc, getScorerInstance(tail.get(j))))
766                        {
767                        assert(!n.equals(tail.get(j))); // All Scorers in the tail must be different...
768                        evictees.add(n);
769                        _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
770                        break;
771                        }
772                    }
773                }
774    
775            // If the minor scavenge identified any poor Scorer
776            // then zap them and avoid a major/slow scavenge if more than one was found.
777            if(!evictees.isEmpty())
778                {
779                // Must remove from sub-population before removing from _scorerScores.
780                subPop.removeAll(evictees);
781                _scorerScores.keySet().removeAll(evictees);
782    
783                // We can avoid a major collection if we did better than simply trimming one entry
784                // which the quick-and-easy default non-scavenge mechanism will do anyway.
785                if(evictees.size() > 1)
786                    {
787    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: minor scavenge successful "+evictees.size()+'/'+startSubPopSize+" Scorers.]"); }
788                    return;
789                    }
790    
791                evictees.clear(); // Don't need to carry these over to the major collection.
792                // Fall through to consider doing a major scavenge...
793                }
794    
795    
796            // MAJOR scavenge limits.
797            // Cap this to a small fraction of the sub-population size so as not to extinguish too much at once.
798            // Cap this also to a fraction of the LRU backstop cache size so as not to overwhelm/flush it.
799            final int maxPurgeCount = Math.min(subPop.size()/8, _backstopLRU.size()/2);
800            // Return immediately if not big enough for us to scavenge from...
801            if(maxPurgeCount < 1) { return; }
802    
803            // After failure of a minor scavenge to make much/any space
804            // we may wish to fall back to our default kill-the-worst-one tactic,
805            // or we may try to force a major collection likely to reclaim a decent chunk immediately.
806            if(!FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS)
807                {
808                // Most of the time, quit before slow/major scavenges to keep the mean execution time low.
809                // We only return on > 0 so that this will fail and fall through the first time.
810                if(countdownToNextScavenge.getAndDecrement() > 0) { return; }
811                // Postpone the next major scavenge enough to keep its contribution ~ O(1).
812                countdownToNextScavenge.addAndGet(1+maxPurgeCount);
813                }
814    
815    //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging (MAJOR)...  Sub-population size = "+(subPop.size())+".]"); }
816    
817            // Work through the Scorers from worst to best,
818            // stopping when we've found enough candidates.
819            // We examine the Scorers in pairs,
820            // deleiberately leaving at least alternate values untouched on any one pass.
821            for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
822                {
823                // Get 'less good' of the two Scorers that we will examine.
824                final String worse = it.next();
825    
826                // Never try and scavenge/remove the best (last) Scorer in this sub-population...
827                if(!it.hasNext()) { break; }
828                // Never try and remove the base/parameterless Scorer from this sub-population.
829                if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
830    
831                // Get the 'better' of the two Scorers compared this time.
832                assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
833                final String better = it.next();
834    
835                // If the worse of the pair is sufficiently old that we wouldn't return it even with allowStale
836                // then dump it immediately (don't even push it into the back-stop cache).
837                // This doesn't have to exactly match the cache behaviour (though doing so is good),
838                // but simply allows us to kick out very old Scorers that have not been re-evaluated at all recently
839                // so may be dead wood consuming valuable space.
840                final Pair<Long, ScoreAndConf> pairW = _scorerScores.get(worse);
841                final long tickWhenCached = EventPeriod.VLONG.getIntervalNumber(pairW.first);
842                if(currentTick - tickWhenCached > 1)
843                    {
844                    evictees.add(worse);
845    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged old Scorer: " + _scorerScores.get(worse) + ".]"); }
846                    continue;
847                    }
848    
849                // Only proceed if the scores of the two Scorers are reasonably similar.
850                final ScoreAndConf sacW = pairW.second;
851                final ScoreAndConf sacB = _scorerScores.get(better).second;
852                // For scores very close to zero we only treat identical scores as similar...
853                if((SIMILAR_SCORE_DELTA == 0) || (Math.abs(sacB.score) < (SIMILAR_SCORE_DELTA<<4)))
854                    { if(sacW.score == sacB.score) { continue; } }
855                else if(Math.abs(sacW.score - sacB.score) > SIMILAR_SCORE_DELTA) { continue; }
856    
857                // Check for genotypic similarity...
858                // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
859                if(!AbstractScorer.verySimilar(getScorerInstance(worse), getScorerInstance(better))) { continue; }
860    
861                // Mark this Scorer for removal (and push into the back-stop cache immediately).
862                evictees.add(worse);
863                _backstopLRU.put(worse, _scorerScores.get(worse));
864    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
865                }
866    
867            // If we didn't manage to evict more than one Scorer with out subtle weeding
868            // (which the quick-and-easy default non-scavenge mechanism would do anyway)
869            // then run an increasingly harsh filter until we fill our quota
870            // so as to pay for the time spent scavenging so far.
871            // We still will never evict the best Scorer,
872            // nor more than a small subset of all Scorers,
873            // and we still start evicting the poorest Scorers overall,
874            // so at worst this is probably similar to having a somewhat smaller maximum population size.
875            if(evictees.size() <= 1)
876                {
877                // Require adjacent Scorers by 'goodness' to be less and less similar to survive each pass
878                // until we manage to force enough Scorers out to give lots of elbow room for new entrants
879                // or all remaining Scorers really seem (genotypically) different
880                // (or we are effectively just forcing Scorers out by lottery).
881                final int nParams = getScorerInstance(subPop.first()).getParameterDefsAndValues().size();
882                // Start with small steps, ie using the finest razor available,
883                // but limit the maximum number of passes even for large parameter sets,
884                // to very approximately O(log(nParams)) unless we make good progress.
885                // We double the step if we did not find new evictees on the previous pass,
886                // else if we did clear some then we set the step back to 1 for maximum resolution.
887                boolean clearedOnLastPass; // True if we cleared anything on the last pass.
888                int step = 1; // Current similarity (decrease) step; strictly positive.
889                for(int similarity = nParams; (similarity > 0) && (evictees.size() < maxPurgeCount); similarity -= step, step = (clearedOnLastPass ? 1 : (2*step)))
890                    {
891    //if(IsDebug.isDebug) { System.out.println("_scavenge(): nParams="+nParams+", similarity="+similarity+", step="+step+", evictees.size()="+evictees.size()); }
892                    clearedOnLastPass = false;
893                    for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
894                        {
895                        // Get 'less good' of the two Scorers that we will examine.
896                        final String worse = it.next();
897    
898                        // Never try and scavenge/remove the best (last) Scorer in this sub-population...
899                        if(!it.hasNext()) { break; }
900                        // Never try and remove the base/parameterless Scorer from this sub-population.
901                        if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
902                        // Skip any Scorer already waiting to be purged...
903                        if(evictees.contains(worse)) { continue; }
904    
905                        // Get the 'better' of the two Scorers compared this time.
906                        assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
907                        final String better = it.next();
908                        // Skip any Scorer already waiting to be purged...
909                        if(evictees.contains(better)) { continue; }
910    
911                        // Check for genotypic similarity...
912                        // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
913                        if(!AbstractScorer.similarNParams(getScorerInstance(worse), getScorerInstance(better), similarity)) { continue; }
914    
915                        // Mark this Scorer for removal (and push into the back-stop cache immediately).
916                        evictees.add(worse);
917                        _backstopLRU.put(worse, _scorerScores.get(worse));
918                        clearedOnLastPass = true;
919    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged near ("+similarity+") 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
920                        }
921                    }
922                }
923    
924            // Now efficiently purge all the evictees (if any) en masse.
925            if(!evictees.isEmpty())
926                {
927    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged "+evictees.size()+'/'+_scorerScores.size()+" Scorers at "+((System.currentTimeMillis()-startTime)/evictees.size())+"ms/Scorer.]"); }
928                // Must remove from sub-population before removing from _scorerScores.
929                subPop.removeAll(evictees);
930                _scorerScores.keySet().removeAll(evictees);
931                }
932            }
933    
934        /**Clear any caches which depend on the population state remaining unchanged.
935         * This clearing is done within the lock that protects the internal data structures
936         * (even though the caches can be safely accessed outside the lock)
937         * to minimise unwanted races (and eliminate them for lock-protected cache accesses).
938         */
939        private void _clearCaches()
940            {
941            synchronized(_lock)
942                {
943                // Clear any caches which may be otherwise invalid after an update.
944                _gCSWP_cache = null; // Will need to recompute the "best" Scorer set.
945                _gBS_cache.clear(); // Will need to recompute any breeding set.
946                }
947            }
948    
949        /**If true, allow posting the global event for exchange of the best/new Scorers.
950         * The mechanism must also be enabled in the GenProps for this to happen.
951         */
952        private static final boolean USE_GLOBAL_EVENTS_ALSO = true;
953    
954        /**Get current population size; non-negative.
955         * This value may slightly exceed the requested maximum.
956         * <p>
957         * This does not include the count of any entries in the back-stop LRU (negative) cache.
958         *
959         * @return count of unique Scorers that might be selected as part of a "breeding set"
960         */
961        public int size() { return(_scorerScores.size()); }
962    
963        /**Return the target maximum population size; non-negative.
964         * The actual population size may exceed this somewhat; this is a target.
965         */
966        public int maxSize() { return(maxPopulationSize); }
967    
968        /**Cache of result from getCurrentScorersWithParameters(); may be null if nothing cached.
969         * Initially null, and cleared on any update (put) to the population.
970         * <p>
971         * Any value cached here must be immutable for safety and sanity.
972         * <p>
973         * Marked volatile for thread-safe lock-free access.
974         */
975        private volatile Set<String> _gCSWP_cache;
976    
977        /**Current immutable set of available Scorer names and parameters (where applicable); never null but may be empty.
978         * The values returned are of the form <i>ScorerName{:name=value}*</i>.
979         * <p>
980         * The Scorers returned by this will generally be the best available,
981         * suitable for making a real prediction of and exhibit score,
982         * and will usually be the best one or handful per base-Scorer type,
983         * allowing that different Scorer types may have different "skill" domains,
984         * and excluding any with non-positive goodness.
985         * <p>
986         * This routine may be expensive, though will try to be efficient.
987         */
988        public Set<String> getCurrentScorersWithParameters(final boolean allowStale)
989            {
990            // Check the cache first.
991            // The cache is cleared by any put/update of the population.
992            final Set<String> cachedValue = _gCSWP_cache;
993            if(_gCSWP_cache != null) { return(cachedValue); }
994    
995            // Result set, of only +ve goodness Scorers.
996            final Set<String> r = new HashSet<String>(4 * baseScorers.size());
997    
998            // Hold a lock on the data structures while working.
999            synchronized(_lock)
1000                {
1001                // Attempt to a candidate from each base type (sub population).
1002                for(final NavigableSet<String> subPop : _scorerByGoodness.values())
1003                    {
1004                    if(!subPop.isEmpty())
1005                        {
1006                        // Have the number of samples from each base name/type to include >= 1
1007                        // (zero iff there are no eligible candidates at all)
1008                        // and growing only (very) slowly with the population size.
1009                        // In any case we try to return a diverse group from each sub-population.
1010                        // We use log(log(n)) to allow for the possibility of multiple distinct useful candidates
1011                        // in very large populations, without wasting too much time on many near clones.
1012                        final int maxToInclude = Math.max(1, GenUtils.log2Approx(Math.max(1, GenUtils.log2Approx(subPop.size()))));
1013    
1014                        // Get the best Scorer of this type and its "goodness".
1015                        final String best = subPop.last();
1016                        final int bGoodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(best).second);
1017                        if(bGoodness <= 0) { continue; /* All bad! */ }
1018                        // We'll accept other Scorers nearly as good as the best,
1019                        // but NOT of identical goodness or otherwise too similar (pheno/genotypically)
1020                        // as we want (useful) variety in what we return, eg for multimodal matching.
1021                        final int threshold = bGoodness/2;
1022                        // Set of goodnesses so far: we don't return two identical.
1023                        final Set<Integer> goodnesses = new HashSet<Integer>(2 * maxToInclude);
1024                        // Set of scores so far: we don't return two identical.
1025                        final Set<Short> scores = new HashSet<Short>(2 * maxToInclude);
1026                        // Set of Scorer instances so far; we don't return two 'very similar'.
1027                        final Collection<ScorerIF> instances = new ArrayList<ScorerIF>(maxToInclude);
1028                        // Work backwards from the best (which is always included)...
1029                        int remaining = maxToInclude;
1030                        main: for(final String snp : subPop.descendingSet())
1031                            {
1032                            final ScoreAndConf sac = _scorerScores.get(snp).second;
1033                            final Short score = sac.score;
1034                            if(scores.contains(score)) { continue; /* Identical to a previous score. */ }
1035                            scores.add(sac.score);
1036                            final int g = ScoreAndConf.computeScorerGoodness(sac);
1037                            // Stop if the goodness falls too far (or to/below zero).
1038                            if(g <= threshold) { break; }
1039                            assert(g > 0); // Only keeping Scorers with merit.
1040                            final Integer G = g;
1041                            if(goodnesses.contains(G)) { continue; /* Identical to a previous goodness. */ }
1042                            goodnesses.add(G);
1043                            // Make sure that this Scorer is not of very similar genotype to a previous one.
1044                            final ScorerIF sc = getScorerInstance(snp);
1045                            assert(sc != null);
1046                            // We require increasingly-different Scorer instances as we gather more Scorers.
1047                            final int maxSimilarity = Math.max(0, sc.getParameterDefsAndValues().size() - instances.size());
1048                            for(final ScorerIF prevSc : instances)
1049                                { if(AbstractScorer.similarNParams(prevSc, sc, maxSimilarity)) { continue main; /* Similar genotype to extant returnees. */ } }
1050                            // Record new genetically-different Scorer.
1051                            instances.add(sc);
1052                            // This Scorer is good enough to return.
1053                            r.add(snp);
1054                            // Stop if we've collected enough.
1055                            if(--remaining <= 0) { break; }
1056                            }
1057                        assert(r.contains(best));
1058                        }
1059                    }
1060                }
1061    
1062            // We avoid cacheing empty results to assist with fast bootstrapping.
1063            if(r.isEmpty()) { return(Collections.emptySet()); }
1064            // Wrap the result to make it immutable, and cache it (unless empty).
1065            final Set<String> result = Collections.unmodifiableSet(r);
1066            _gCSWP_cache = result;
1067    
1068            return(result);
1069            }
1070    
1071        /**Cache of results for getBreedingSet(); never null.
1072         * Cleared by any put/update operation on the population.
1073         * <p>
1074         * Thread-safe map from (valid) baseName to breeding set.
1075         */
1076        private final ConcurrentMap<String, List<String>> _gBS_cache = new ConcurrentHashMap<String, List<String>>();
1077    
1078        /**Get immutable random-access list of Scorers to "breed" from, best first; never null, possibly empty.
1079         * This returns all of (or some of the best of) the Scorers with the given base name.
1080         * <p>
1081         * This generally trims away all Scorers with non-positive values,
1082         * but will always try to return the few best items regardless,
1083         * ie the result is not completely empty unless the specified sub-population is.
1084         * <p>
1085         * The result will generally be large if the population is.
1086         */
1087        public List<String> getBreedingSet(final String baseName)
1088            {
1089            // Validate the base name first.
1090            if(baseName == null) { throw new IllegalArgumentException(); }
1091            if(!baseScorers.keySet().contains(baseName)) { throw new IllegalArgumentException(); }
1092    
1093            final List<String> cachedValue = _gBS_cache.get(baseName);
1094            if(cachedValue != null) { return(cachedValue); }
1095    
1096            // Hold lock while accessing mutable data.
1097            synchronized(_lock)
1098                {
1099                // Get in "worst-first" order.
1100                final ArrayList<String> r = new ArrayList<String>(_scorerByGoodness.get(baseName));
1101                // Convert to "best-first" order.
1102                Collections.reverse(r);
1103                final int size = r.size();
1104                final int limit = size/4;
1105                // Eliminate all those with non-positive goodness if possible,
1106                // but don't remove the best ~25% however poor they may appear to be
1107                // else we will not have any material to breed (and try to improve) from.
1108                for(int i = size; --i > limit; )
1109                    {
1110                    final int goodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(r.get(i)).second);
1111                    if(goodness > 0) { break; }
1112                    // Remove this trailing zero-goodness item.
1113                    r.remove(i);
1114                    }
1115                // Result must not be empty unless the sub-population is.
1116                assert(_scorerByGoodness.get(baseName).isEmpty() || !r.isEmpty());
1117    
1118                // Wrap the result for immutablity and cache it.
1119                final List<String> result = Collections.unmodifiableList(r);
1120                _gBS_cache.putIfAbsent(baseName, result); // Reduce chance of update races.
1121    
1122                return(result);
1123                }
1124            }
1125    
1126        /**Interface of call-back hook to report a new "best" Scorer by name, eg to the system event record.
1127         */
1128        public interface NewBestCallbackIF
1129            {
1130            /**Used to post a new "best of breed" Scorer name and parameters for posterity.
1131             * This routine should not throw any exceptions.
1132             * @param scorerNameAndParameters  valid name-and-parameters set; never null.
1133             */
1134            public void reportNewBestScorer(final String scorerNameAndParameters);
1135            }
1136    
1137    
1138        /**Small thread-safe LRU cache for getScorerInstance(); never null.
1139         * Intended to avoid repeated re-parsing of args and recreation of instances, etc,
1140         * for very-commonly-used Scorers used to evaluate every exhibit.
1141         * Usually we'd expect this to be a handful actually needed regularly,
1142         * but while hunting for new solutions (or scavenging for space)
1143         * we'll expect to churn this cache quite hard.
1144         */
1145        private final SimpleLRUMapAutoSizeForHitRate<String, ScorerIF> _gSI_cache;
1146    
1147        /**Get Scorer instance given the Scorer{:value=name}* format; null if no such Scorer available.
1148         * Any instance returned may be a shared/cached instance rather than a new instance.
1149         * <p>
1150         * This does not attempt to canonicalise the names it is given,
1151         * so may create and cache duplicates for example.
1152         * <p>
1153         * This can be used to create any valid Scorer instance,
1154         * not just entries in the given population.
1155         */
1156        public ScorerIF getScorerInstance(final String nameAndParameters)
1157            {
1158            if(nameAndParameters == null) { throw new IllegalArgumentException(); }
1159    
1160            // Try lookup in the cache; return a cached result without further ado!
1161            final ScorerIF cached = _gSI_cache.get(nameAndParameters);
1162            if(cached != null) { return(cached); }
1163    
1164            // Quick extraction of base name.
1165            final int fs = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
1166            final boolean hasParams = (fs != -1);
1167            final String baseName = hasParams ? nameAndParameters.substring(0, fs) : nameAndParameters;
1168    
1169            // Get a base instance of the Scorer, if any.
1170            final ScorerIF sc = baseScorers.get(baseName);
1171            // If this base name is not known/supported, stop now.
1172            if(sc == null) { return(null); }
1173    
1174            // Fast-track the simple no-parameters case.
1175            if(!hasParams) { return(sc); }
1176    
1177            try
1178                {
1179                // Use base non-parameterised instance to create variant.
1180                final ScorerIF newInstance = sc.createVariant(nameAndParameters);
1181                assert(newInstance != null);
1182                // Cache and return this value.
1183                _gSI_cache.put(nameAndParameters, newInstance);
1184                return(newInstance);
1185                }
1186            // Handle a simple argument-format error by propagating up to caller as usual.
1187            catch(final IllegalArgumentException e) { throw e; }
1188            // Log and propagate any other error.
1189            catch(final Exception e) { e.printStackTrace(); throw new Error(e); }
1190            }
1191    
1192        }