001    /*
002    Copyright (c) 1996-2011, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.ai.scorer;
031    
032    import java.io.IOException;
033    import java.util.ArrayList;
034    import java.util.Collection;
035    import java.util.Collections;
036    import java.util.Comparator;
037    import java.util.HashMap;
038    import java.util.HashSet;
039    import java.util.Iterator;
040    import java.util.List;
041    import java.util.Map;
042    import java.util.NavigableSet;
043    import java.util.Set;
044    import java.util.SortedSet;
045    import java.util.TreeSet;
046    import java.util.concurrent.ConcurrentHashMap;
047    import java.util.concurrent.ConcurrentMap;
048    import java.util.concurrent.atomic.AtomicInteger;
049    
050    import org.hd.d.pg2k.svrCore.GenUtils;
051    import org.hd.d.pg2k.svrCore.MemoryTools.SimpleLRUMapAutoSizeForHitRate;
052    import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
053    import org.hd.d.pg2k.svrCore.Tuple.Pair;
054    import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
055    import org.hd.d.pg2k.svrCore.vars.EventPeriod;
056    import org.hd.d.pg2k.svrCore.vars.SimpleVariableValue;
057    import org.hd.d.pg2k.svrCore.vars.SystemVariables;
058    
059    import ORG.hd.d.IsDebug;
060    
061    
062    /**Thread-safe container for Scorer-and-parameter information to maintain an evolving population.
063     * This separates Scorers by basic (parameterless) name,
064     * on the grounds that they are not directly comparable.
065     * <p>
066     * This has limited capacity (so as to cap resource use),
067     * and discards the least good parameterised Scorers as needed
068     * to make way for new (better) instances.
069     * (These discarded less-good Scorers may have their results retained
070     * in a smalled secondary "negative" cache to speed up re-testing.)
071     * <p>
072     * This has the automatially persists/announces/shares new/good instances
073     * using the system variables events mechanism, local and global.
074     * <p>
075     * TODO: in case of accute memory shortage abandon least-good Scorers
076     */
077    public final class ScorerPopulation
078        {
079        /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
080         * Note that we may slightly exceed the maximum specified population size for expediency,
081         * and we may dump some (less good) part of the population under memory stress.
082         *
083         * @param baseScorers  non-null, non-empty, pure-base-name seed set
084         * @param maxPopulationSize  maximum number of Scorer details to retain in memory,
085         *     possibly excluding the fixed/base Scorers, strictly positive;
086         *     this value is advisory and may be slightly exceeded for performance
087         * @param callback  used to report new best-of-breed being inserted into cache;
088         *     may be null
089         */
090        public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
091                                final NewBestCallbackIF callback,
092                                final int maxPopulationSize)
093            {
094            this.callback = callback;
095    
096            if(baseScorers == null) { throw new IllegalArgumentException(); }
097            // Store immutable (defensive) copy of base Scorer names/instances.
098            this.baseScorers = Collections.unmodifiableMap(new HashMap<String, ScorerIF>(baseScorers));
099            if(this.baseScorers.isEmpty()) { throw new IllegalArgumentException(); }
100            // Validate the Scorer names and instances.
101            for(final String n : this.baseScorers.keySet())
102                {
103                // Validate the name, and make sure that there are no parameters.
104                if(!AbstractScorer.parseNameAndParameters(n).second.isEmpty())
105                    { throw new IllegalArgumentException("no parameters allowed on seed set"); }
106                // Ensure that the name corresponds to the actual Scorer instance supplied.
107                final ScorerIF scorer = baseScorers.get(n);
108                if((scorer == null) || !n.equals(scorer.getBaseName()))
109                    { throw new IllegalArgumentException("mismatch between Scorer base name and instance"); }
110                if(!n.equals(scorer.getNameAndParameters()))
111                    { throw new IllegalArgumentException("supplied Scorer instance was parameterised"); }
112                }
113            // Create the Set of valid classes for quick validation of instances.
114            supportedScorerClasses = new HashSet<Class<? extends ScorerIF>>(2 * this.baseScorers.size());
115            for(final ScorerIF sc : this.baseScorers.values())
116                { supportedScorerClasses.add(sc.getClass()); }
117    
118            // Must be at least two slots per base Scorer type/name
119            // (to allow for the base/parameterless value and one 'optimal' value).
120            if(maxPopulationSize <= 2*this.baseScorers.size()) { throw new IllegalArgumentException("population too small"); }
121            this.maxPopulationSize = maxPopulationSize;
122    
123            // Size to accommodate the specified capacity without further expansion.
124            // We don't expect much write concurrency.
125            // We use a lowish load-factor to enhance read and write performance.
126            _scorerScores = new ConcurrentHashMap<String, Pair<Long, ScoreAndConf>>(2*maxPopulationSize, 0.5f, 4);
127    
128            // The comparator arranges for the items to be sorted by increasing "goodness".
129            // Because of a lack of symmetry in scoring
130            // (ie reversing the prediction of a bad Scorer does not necessarily make for a good prediction)
131            // we are most interested in the highest-confidence, positive-correlation Scorers.
132            SCORER_RANK_COMPARATOR = new Comparator<String>()
133                {
134                // This treats missing items from _scorerScores as neutral (goodness 0).
135                public final int compare(final String a, final String b)
136                    {
137                    final ScoreAndConf sacA = _scorerScores.get(a).second;
138                    final ScoreAndConf sacB = _scorerScores.get(b).second;
139                    // Primary sort by goodness.
140                    final int goodnessA = (sacA == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacA);
141                    final int goodnessB = (sacB == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacB);
142                    if(goodnessA < goodnessB) { return(-1); }
143                    if(goodnessA > goodnessB) { return(+1); }
144                    if((sacA != null) && (sacB != null))
145                        {
146                        // Secondary sort by score (if both scores available).
147                        if(sacA.score < sacB.score) { return(-1); }
148                        if(sacA.score > sacB.score) { return(+1); }
149                        }
150                    // Break ties by name to ensure a total ordering.
151                    return(a.compareTo(b));
152                    }
153                };
154            // Create the (immutable) outer map with empty sub-maps for _scorerByGoodness.
155            final Map<String, NavigableSet<String>> sbg = new HashMap<String, NavigableSet<String>>(2 * baseScorers.size());
156            for(final String n : this.baseScorers.keySet())
157                { sbg.put(n, new TreeSet<String>(SCORER_RANK_COMPARATOR)); }
158            _scorerByGoodness = Collections.unmodifiableMap(sbg);
159    
160            // Backup LRU cache, mainly for "bad" Scorer results.
161            // This also accommodates 'clones' evicted from the main population.
162            // This can be cleared entirely during extreme memory stress.
163            final int _backstopLRUMaxSize = 1 + maxPopulationSize/2;
164            _backstopLRU = SimpleLRUMapAutoSizeForHitRate.<String, Pair<Long, ScoreAndConf>>create(0, _backstopLRUMaxSize, "_backstopLRU");
165    
166            // Create Scorer lookup cache, size limited, in proportion to max population size.
167            // This allows us to hold in cache every live Scorer in the main population.
168            // This assumes that each Scorer instance does not consume much memory/resource.
169            // Since we may well need to churn through the entire population sequentially,
170            // eg during scavenging or possibly while selecting a breeding set,
171            // this cache should be significantly larger than the population size to be of most value.
172            // This can be cleared entirely during extreme memory stress.
173            _gSI_cache = SimpleLRUMapAutoSizeForHitRate.<String, ScorerIF>create(0, 1 + ((3*maxPopulationSize)/2), "_gSI_cache");
174            }
175    
176        /**Comparator used to rank Scorers relative to one another by increasing goodness, never null.
177         * Scorer values not currently represented in the population
178         * are treated as if goodness 0 (ie neutral).
179         * <p>
180         * If both goodness values are available then a secondary ranking is done
181         * by increasing score.
182         * <p>
183         * Complete ties are broken by Scorer name.
184         */
185        public final Comparator<String> SCORER_RANK_COMPARATOR;
186    
187        /**If true then we may kick out apparent-near duplicates as the population reaches maximum.
188         * The aim of doing this is to increase diversity of the 'gene pool' for a given capacity,
189         * by making space for significantly different genotypes
190         * and avoiding them being swamped or pushed out by many near-clones of the overall best Scorer
191         * in any given sub-population.
192         * <p>
193         * This scavenging may have a random element.
194         * <p>
195         * Scavenging may be quite slow.
196         */
197        private static final boolean SCAVENGE_FOR_DIVERSITY = true;
198    
199        /**Reset/clear the population to its initial (empty) state.
200         * Does not shut out all other activity,
201         * so it is possible that new items may be inserted while this is running.
202         */
203        public void clear()
204            {
205            // First clear the back-stop cache.
206            _backstopLRU.clear();
207            // Now systematically clear the main data stores and caches under the data lock.
208            synchronized(_lock)
209                {
210                _clearCaches();
211                _scorerScores.clear();
212                for(final NavigableSet<String> nm : _scorerByGoodness.values())
213                    { nm.clear(); }
214                }
215            // Finally clear out the Scorer instance cache.
216            _gSI_cache.clear();
217            }
218    
219        /**Construct with a list of the base Scorers (names, no parameters) to keep track of.
220         * Note that we may slightly exceed the maximum specified population size for expediency,
221         * and we may dump some (less good) part of the population under memory stress.
222         * <p>
223         * This automatically posts new best-of-breed Scorers to the system variables event store
224         * as they are cached.
225         *
226         * @param baseScorers  non-null, non-empty, pure-base-name seed set
227         * @param maxPopulationSize  maximum number of Scorer details to retain in memory,
228         *     possibly excluding the fixed/base Scorers, strictly positive;
229         *     this value is advisory and may be slightly exceeded for performance
230         * @param vars  system variables pipeline to record new/best Scorer events to
231         */
232        public ScorerPopulation(final Map<String, ScorerIF> baseScorers,
233                                final int maxPopulationSize,
234                                final SimpleExhibitPipelineIF vars)
235            {
236            this(baseScorers,
237                 (new NewBestCallbackIF() {
238                    public final void reportNewBestScorer(final String nameAndParameters)
239                        {
240                        final SimpleVariableValue svvL = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_LOCAL_EVENT,
241                                                                                nameAndParameters);
242                        try { vars.setVariable(svvL); }
243                        catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
244                        try
245                            {
246                            if(USE_GLOBAL_EVENTS_ALSO &&
247                                    "true".equalsIgnoreCase(vars.getGenProps(-1).getGen().get(GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX)))
248                                {
249                                final SimpleVariableValue svvG = new SimpleVariableValue(SystemVariables.AI_SCORER_STRING_GLOBAL_EVENT,
250                                        nameAndParameters);
251                                try { vars.setVariable(svvG); }
252                                catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
253                                }
254                            }
255                        catch(final IOException e) { e.printStackTrace(); /* Complain but continue after error. */ }
256                        }
257                    }),
258                 maxPopulationSize);
259            }
260    
261        /**Maximum number of Scorer details to retain, possibly excluding the fixed/base Scorers, strictly positive.
262         * Must in fact be large than the number of base Scorer types,
263         * since we will need at least once slot for each non-parameterised type.
264         */
265        private final int maxPopulationSize;
266    
267        /**Immutable Map from base Scorer names (no parameters) to base/parameterless instances to monitor/maintain; non-null, non-empty. */
268        private final Map<String, ScorerIF> baseScorers;
269    
270        /**Set of class values for the supported Scorer types for fast validation; never null. */
271        private final Set<Class<? extends ScorerIF>> supportedScorerClasses;
272    
273        /**Callback to report new best-of-breed being inserted into cache; may be null. */
274        private final NewBestCallbackIF callback;
275    
276        /**A back-stop LRU cache of Scorer goodness, so that in particular the results of poor Scorers need not be excessively recomputed; never null.
277         * Normally we only keep the ScoreAndConf of the best Scorers,
278         * but if we happen to be repeatedly asked for the results of poor Scorers
279         * and we don't hold those results because they are not good to be in the main population
280         * then we may simply waste disproportionate time recomputing them.
281         * <p>
282         * This aims to be a bonus back-stop cache of all results not in the main cache,
283         * and is not strictly part of the counted population
284         * because these are not part of the potential "breeding set":
285         * they are here only for "negative cacheing" of poor Scorer parameters sets.
286         * <p>
287         * We keep it much smaller than the main population
288         * so that it is not a significant resource drain.
289         * <p>
290         * We push Scorers into it when they are displaced from the main population.
291         * <p>
292         * Thread-safe.
293         * <p>
294         * It is permissible and viable to hold a lock on this object over multiple operations
295         * to make them into one compound atomic operation.
296         */
297        private final SimpleLRUMapAutoSizeForHitRate<String, Pair<Long, ScoreAndConf>> _backstopLRU;
298    
299        /**Non-user-visible  lock to serialise activity on _scorerScores and _scorerByGoodness; never null.
300         * No blocking/external calls are made in the scope of this lock
301         * so as to ensure that it is held only for very short intervals.
302         */
303        private final Object _lock = new Object();
304    
305        /**Thread-safe cache of Scorer scores (and time when cached), indirectly size-limited; never null.
306         * Map from scorer name-and-parameters to ScoreAndConfidence and the time computed/cached.
307         * <p>
308         * Single get() accesses to this can be made safely without further locking
309         * so as to maximise concurrency.
310         * <p>
311         * Note that since a put() operation does not lock out get() operations,
312         * any updates on the main _scorerScores map must be incrementally valid,
313         * ie incorrect data must never be inserted into the _scorerScores map.
314         */
315        private final ConcurrentMap<String, Pair<Long, ScoreAndConf>> _scorerScores;
316    
317        /**Map from base Scorer name to a least-good-first NavigableSet of full nameAndParameters; never null.
318         * We pre-populate this with empty maps under each base name that we handle
319         * (so it is never necessary to test for presence in, or ever alter, this outer map; hence it is immutable).
320         * <p>
321         * The top-level Comparator is selected to sort by goodness derived from the _scorerScores,
322         * so any extant entry in the nameAndParameters should be removed from the _scorerByGoodness map
323         * before removal from or replacement in the _scorerScores map,
324         * and re-inserted afterwards as necessary.
325         * <p>
326         * Since the different base Scorer names/types are held separately,
327         * this helps keep the probably-distinct populations from competing with one another,
328         * though the overall population size cap is still present.
329         * <p>
330         * Because of a potential lack of symmetry in scoring
331         * (ie reversing the prediction of a bad Scorer does not necessarily make it a good one)
332         * we are interested in preserving the highest-confidence, positive-correlation Scorers.
333         * <p>
334         * Not thread-safe; access only under _lock.
335         */
336        private final Map<String, NavigableSet<String>> _scorerByGoodness;
337    
338        /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
339         * Never recomputes, always fetches from its cache if present, else returns null.
340         * <p>
341         * Older results (typically up to 1 day older) may be returned if allowStale is true;
342         * this may help the system overall ride out temporary I/O problems and disconnections.
343         * <p>
344         * These get operations are generally fast,
345         * though some cycle-stealing incremental housekeeping might be performed on this call.
346         *
347         * @param scorer  the Scorer instance; never null
348         * @param allowStale  if true then allow an older cache response
349         *     (up to VLONG period older)
350         *
351         * @return  the score represents the correlation with the underlying votes
352         *     (and whatever the scoring is measured against)
353         *     with MAX meaning perfect correlation, 0 meaning no correlation,
354         *     and -MAX meaning perfectly wrong answers all the time,
355         *     and the confidence 0 if we have no (or very few) data points yet
356         *     and approaching MAX as we have a large (enough) number of data points
357         *
358         * @throws IllegalArgumentException  if the Scorer is null
359         */
360        public ScoreAndConf getScorerWeighting(final ScorerIF scorer,
361                                               final boolean allowStale)
362            {
363            if(scorer == null) { throw new IllegalArgumentException(); }
364            return(getScorerWeighting(scorer.getNameAndParameters(), allowStale));
365            }
366    
367        /**ScoreAndConfidence for the given Scorer over all exhibit types; null if not known.
368         * Never recomputes, always fetches from its cache if present, else returns null.
369         * <p>
370         * Older results (typically up to 1 day older) may be returned if allowStale is true;
371         * this may help the system overall ride out temporary I/O problems and disconnections.
372         * <p>
373         * These get operations are generally fast,
374         * though some cycle-stealing incremental housekeeping might be performed on this call.
375         *
376         * @param scorerNameAndParameters  the name and parameters of the Scorer; never null
377         * @param allowStale  if true then allow an older cache response
378         *     (up to VLONG period older)
379         *
380         * @return  the score represents the correlation with the underlying votes
381         *     (and whatever the scoring is measured against)
382         *     with MAX meaning perfect correlation, 0 meaning no correlation,
383         *     and -MAX meaning perfectly wrong answers all the time,
384         *     and the confidence 0 if we have no (or very few) data points yet
385         *     and approaching MAX as we have a large (enough) number of data points
386         */
387        public ScoreAndConf getScorerWeighting(final String scorerNameAndParameters,
388                                               final boolean allowStale)
389            {
390            if(scorerNameAndParameters == null) { throw new IllegalArgumentException(); }
391    
392            final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(System.currentTimeMillis());
393    
394            // We don't need to validate the requested name (other than as being non-null)
395            // since an invalid value will correctly return null.
396            // We don't need to take any locks for this single get();
397            // this helps maximise concurrency and can proceed
398            // even when a long/slow put() operation is taking place in another thread.
399            final Pair<Long, ScoreAndConf> cachedValue = _scorerScores.get(scorerNameAndParameters);
400            // We can use a value from cache if the cached value is from the current period
401            //     (or the previous period if allowStale is true).
402            if((cachedValue != null) &&
403               (EventPeriod.VLONG.getIntervalNumber(cachedValue.first) >= (allowStale ? currentPeriod-1 : currentPeriod)))
404                { return(cachedValue.second); }
405    
406            // Try back-stop LRU cache.
407            // We access this out of the scope of the main lock to maximise concurrency.
408            // Note that we allow use of older values than from the main population.
409            // We actively remove expired items to make more room in this back-stop cache.
410            synchronized(_backstopLRU)
411                {
412                final Pair<Long, ScoreAndConf> backstopResult = _backstopLRU.get(scorerNameAndParameters);
413                if(backstopResult != null)
414                    {
415                    final long oldestRetained = currentPeriod-2;
416                    final long cachedItemAge = EventPeriod.VLONG.getIntervalNumber(backstopResult.first);
417                    if(cachedItemAge < oldestRetained)
418                        {
419                        // Remove expired value to make room for others.
420                        _backstopLRU.remove(scorerNameAndParameters);
421                        }
422                    else if(cachedItemAge >= (allowStale ? oldestRetained : currentPeriod))
423                        {
424    //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation.getScorerWeighting(): caught lookup with back-stop LRU cache.]"); }
425                        return(backstopResult.second);
426                        }
427                    }
428                }
429    
430            // Nothing in either cache...
431            return(null);
432            }
433    
434        /**GenProps 'generic' parameter that must be set for us to post to the global event store. */
435        public static final String GEN_PROPS_GLOBAL_POST_ENABLE_SUFFIX = "ai.scorerGlobalPostEnable";
436    
437        /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
438         * Saves the given Scorer and "goodness",
439         * if necessary evicting a poorly-performing Scorer to make room
440         * ie to limit the population size.
441         * <p>
442         * If the "put" Scorer is very good compared to the existing population members
443         * then it may get persisted/shared via the event mechanism.
444         * <p>
445         * This routine will only consider storing entries for Scorers whose base name
446         * is on the list we were constructed with.
447         * <p>
448         * Note that a put() operation does not lock out get() operations,
449         * so any updates on the main _scorerScores map must be incrementally valid,
450         * ie incorrect data must never be inserted into the _scorerScores map.
451         * <p>
452         * This will attempt to cache the actual Scorer instance provided
453         * which may save time reconstructing it later, and memory by better sharing.
454         * This instance may be recovered with getScorerInstance() later,
455         * though may have to be recreated (ie returning the same instance is never guaranteed).
456         *
457         * @param scorer  the Scorer instance; never null
458         * @param sac  rating of this Scorer across all exhibits; never null
459         * @param dataSource  to record/persist/share good Scorers through;
460         *     null if no Scorer persistence is required
461         * @param log logger; never null
462         *
463         * @return  true if the Scorer result is new for its sub-population and best in it
464         *     and has a positive overall "goodness" (ie has some positive predictive ability)
465         *
466         * @throws IllegalArgumentException  if the Scorer is null or of an unsupported type
467         */
468        public boolean putScorerWeighting(final ScorerIF scorer,
469                                          final ScoreAndConf sac,
470                                          final SimpleExhibitPipelineIF dataSource,
471                                          final SimpleLoggerIF log)
472            {
473            if(scorer == null) { throw new IllegalArgumentException(); }
474            if(!supportedScorerClasses.contains(scorer.getClass())) { throw new IllegalArgumentException("unsupported Scorer class"); }
475            final String nameAndParameters = scorer.getNameAndParameters();
476            _gSI_cache.put(nameAndParameters, scorer);
477            return(putScorerWeighting(nameAndParameters, sac, dataSource, log));
478            }
479    
480        /**Save the ScoreAndConfidence for the given Scorer over all exhibit types.
481         * Saves the given Scorer and "goodness",
482         * if necessary evicting a poorly-performing Scorer to make room
483         * ie to limit the population size.
484         * <p>
485         * If the "put" Scorer is very good compared to the existing population members
486         * then it may get persisted/shared via the event mechanism.
487         * <p>
488         * This routine will only consider storing entries for Scorers whose base name
489         * is on the list we were constructed with.
490         * <p>
491         * Note that a put() operation does not lock out get() operations,
492         * so any updates on the main _scorerScores map must be incrementally valid,
493         * ie incorrect data must never be inserted into the _scorerScores map.
494         *
495         * @param nameAndParameters  the name and parameters of the scorer; never null
496         * @param sac  rating of this Scorer across all exhibits; never null
497         * @param dataSource  to record/persist/share good Scorers through;
498         *     null if no Scorer persistence is required
499         * @param log logger; never null
500         *
501         * @return  true if the Scorer result is new for its sub-population and best in it
502         *     and has a positive overall "goodness" (ie has some positive predictive ability)
503         */
504        public boolean putScorerWeighting(final String nameAndParameters,
505                                          final ScoreAndConf sac,
506                                          final SimpleExhibitPipelineIF dataSource,
507                                          final SimpleLoggerIF log)
508            {
509            if(sac == null) { throw new IllegalArgumentException(); }
510            if(nameAndParameters == null) { throw new IllegalArgumentException(); }
511    
512            final Pair<String, Map<String, String>> nap = AbstractScorer.parseNameAndParameters(nameAndParameters);
513            final String baseName = nap.first;
514            // Ignore if not one of our recognised base Scorers.
515            if(!baseScorers.keySet().contains(baseName)) { return(false); }
516            final int goodness = ScoreAndConf.computeScorerGoodness(sac);
517            final boolean parameterless = nap.second.isEmpty();
518    
519            final long startTime = System.currentTimeMillis();
520    
521            // Set true if we discover that this is the best Scorer of its type so far
522            // so we should save/share it.
523            boolean isNewAndBest = false;
524    
525            // Still a candidate for storing, so grab a lock...
526            synchronized(_lock)
527                {
528                // Clear caches within the lock scope to avoid/minimise races.
529                _clearCaches();
530    
531                final SortedSet<String> insertedScorerSubPopulation = _scorerByGoodness.get(baseName);
532                // Compute previous 'best' score in this sub-population if any.
533                final int prevBest = insertedScorerSubPopulation.isEmpty() ? Integer.MIN_VALUE :
534                    ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second);
535    
536                // Remove and return any extant value for this Scorer value, null if none previously.
537                final Pair<Long, ScoreAndConf> oldVal = _remove(nameAndParameters);
538    
539                // Insert the value: first in _scorerScores, then in the dependent _scorerByGoodness.
540                // We DO NOT intern() the possibly-long name because doing so would stress the PermGen.
541                _scorerScores.put(nameAndParameters, new Pair<Long, ScoreAndConf>(startTime, sac));
542                insertedScorerSubPopulation.add(nameAndParameters);
543    
544                // Test if this is the best Scorer in its sub-population
545                // and has a non-zero goodness.
546                // Note that this has already been inserted into that sub-population,
547                // so cannot be "better",
548                // so we test against the previous 'best' value if any.
549                assert(goodness <= ScoreAndConf.computeScorerGoodness(_scorerScores.get(insertedScorerSubPopulation.last()).second));
550                if((goodness > 0) && (goodness > prevBest) && (oldVal == null))
551                    { isNewAndBest = true; }
552    
553                // If the population is oversize following this insertion
554                // then we'll cull the worst Scorer from the largest sub-population if possible.
555                if(_scorerScores.size() > maxPopulationSize)
556                    {
557                    // Find the largest sub-population.
558                    int largestSubPopulation = -1;
559                    String largestSubPopulationBasename = null;
560                    for(final String subPopBaseName : _scorerByGoodness.keySet())
561                        {
562                        final int subPopSize = _scorerByGoodness.get(subPopBaseName).size();
563                        if(subPopSize > largestSubPopulation)
564                            {
565                            largestSubPopulation = subPopSize;
566                            largestSubPopulationBasename = subPopBaseName;
567                            }
568                        }
569                    assert(largestSubPopulation > 0);
570                    assert(largestSubPopulationBasename != null);
571    
572                    // Don't attempt scavenging all the time, since it may be very expensive, ie O(n).
573                    // We try to reduce the average time scavenging to about O(1) on average.
574                    // Scavenging is likely to work more effectively per-call if called infrequently.
575                    if(SCAVENGE_FOR_DIVERSITY)
576                        {
577                        // Attempt one scavenging (clone-stripping) run on this (largest) sub-population.
578                        _scavenge(largestSubPopulationBasename);
579                        }
580    
581                    // If scavenging failed to recover any space then evict the least-good Scorer.
582                    if(_scorerScores.size() > maxPopulationSize)
583                        {
584                        // ZAP LEAST-GOOD SCORER SO AS TO CAP THE POPULATION SIZE IF (STILL) OVERSIZE...
585                        // Iterate through the largest population
586                        // culling the first (least good) parameterised Scorer(s) found, if any.
587                        // We may not be able to remove anything, leaving the population slightly oversized.
588                        final SortedSet<String> victimSubPop = _scorerByGoodness.get(largestSubPopulationBasename);
589                        for(final String putativeVictim : victimSubPop)
590                            {
591                            // We never remove the parameterless base Scorer from its sub-population.
592                            if(putativeVictim.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
593                            // We push this about-to-be-zapped victim into the back-stop LRU cache...
594                            _backstopLRU.put(putativeVictim, _scorerScores.get(putativeVictim));
595                            // Now remove the victim from the main data stores.
596                            final Pair<Long, ScoreAndConf> zapped = _remove(putativeVictim);
597                            assert(zapped != null); // Must have actually removed the specified 'poor' Scorer.
598                            // Having trimmed the population (and altered the collections) we must now stop.
599                            break;
600                            }
601                        }
602                    }
603                }
604    
605            // If this a new/best +ve-goodness Scorer (in its sub-population) then post it locally
606            // if not or parameterless, and if we have a live dataSource.
607            // This is done outside of the lock scope
608            // to avoid blocking other get/put operations during I/O.
609            // We absorb (and report) any I/O error, but continue.
610            if(isNewAndBest && !parameterless && (callback != null))
611                {
612    if(IsDebug.isDebug) { log.log("[ScorerPopulation: new best Scorer: "+nameAndParameters+" "+sac+", 'goodness'="+goodness+"...]"); }
613                callback.reportNewBestScorer(nameAndParameters);
614                }
615    
616            return(isNewAndBest);
617            }
618    
619        /**Remove and return any extant value for this Scorer value, null if none previously.
620         * Must only be called under the data lock to be atomic wrt the data stores.
621         *
622         * @param nameAndParameters  non-null valid Scorer value incluing optional parameters
623         *
624         * @return  previous timestamp/value for this Scorer, else null if nne
625         */
626        private synchronized Pair<Long, ScoreAndConf> _remove(final String nameAndParameters)
627            {
628            assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
629    
630            // See if we have an existing entry for this nameAndParameters.
631            // (If not, there is nothing to remove.)
632            final Pair<Long, ScoreAndConf> oldVal = _scorerScores.get(nameAndParameters);
633            if(oldVal == null) { return(null); }
634            // If we do, then remove the corresponding entry from _scorerByGoodness first
635            // else we will break its comparator/lookup which depends on _scorerScores.
636            // Quickly extract the base name of the Scorer to select the sub-population.
637            final int firstSep = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
638            final String baseName = (firstSep == -1) ? nameAndParameters : (nameAndParameters.substring(0, firstSep));
639            // Remove from the appropriate sub-population first...
640            final SortedSet<String> subPopulation = _scorerByGoodness.get(baseName);
641            assert (subPopulation != null);
642            assert (subPopulation.contains(nameAndParameters));
643            subPopulation.remove(nameAndParameters);
644            assert (!subPopulation.contains(nameAndParameters));
645            // Then remove from the main mapping, ie _scorerScores.
646            _scorerScores.remove(nameAndParameters);
647            return(oldVal);
648            }
649    
650        /**Countdown to next scavenge: when zero then scavenge and add on current population size. */
651        private final AtomicInteger countdownToNextScavenge = new AtomicInteger();
652    
653        /**Minimum 'delta' in score for two Scorers to be considered potentially similar; non-negative.
654         * This is the minimum difference in the raw score, not weighted by confidence,
655         * for two Scorers on similar overall goodness to be considered 'similar',
656         * given that we will then further filter by genotype and/or phenotype,
657         * the assumption being that differences of this scale may just be (rounding and other) noise.
658         * <p>
659         * A good value for this is probably less than 1% of maximum,
660         * even though many Scorers will struggle to be much better than chance than this.
661         * <p>
662         * Note that we may apply a tighter test to values very close to zero.
663         * <p>
664         * A value of zero indicates that scores have to be identical
665         * to be considered similar by this measure.
666         */
667        private static final int SIMILAR_SCORE_DELTA = Math.max(2, ScoreAndConf.MAX_GOODNESS >>> 7);
668    
669        /**If true then if a minor scavenge fails always force a major scavenge.
670         * This may be expensive but more-or-less guarantees
671         * that there will always be some space in the population
672         * for marginal but unusual Scorers to have a chance to be bred from.
673         */
674        private static final boolean FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS = true;
675    
676        /**Attempt to strip out near-clones from the population to make space for more diversity.
677         * This works on the assumption that there may be many valuable Scorer values
678         * that while not optimal across all exhibits or at this very moment,
679         * may be valuable on sub-sets of the exhibits now or in the near future.
680         * This routine attempts to prevent limited population space (gene-pool capacity)
681         * from being overrun with very slight variations on (effective clones of) a monoculture.
682         * <p>
683         * Currently this is done by elimination of genotypic near-clones
684         * by eliminating those Scorers that have very similar Scores and no sufficiently-distinct genes.
685         * <p>
686         * TODO: In future this may be done by elimination of phenotypic near clones
687         * by eliminating those Scorers that are have the same predction performance over a given calbration set.
688         * <p>
689         * This may not trim out anything from the specified sub-population.
690         * <p>
691         * This will never remove the 'best' item nor the base/parameterless Scorer.
692         * <p>
693         * Values removed by scavenging may be pushed into the back-stop LRU cache
694         * in case they are used again soon by the outside world;
695         * thus we may limit the amount of any scavengine we do in one go
696         * so as to avoid overflowing this back-stop cache.
697         * <p>
698         * This tries a series of more drastic pruning measures until it zaps some/enough Scorers,
699         * with the inital pruning passes being very quick and gentle.
700         * <p>
701         * This does not guarantee to remove any Scorers at all.
702         * <p>
703         * This must be called under the data lock.
704         * <p>
705         * Run time is expected to average about O(1) (peak O(n)) for a size-n sub-population.
706         *
707         * @param largestSubPopulationBasename
708         */
709        private void _scavenge(final String largestSubPopulationBasename)
710            {
711            assert((largestSubPopulationBasename != null) && (_scorerByGoodness.get(largestSubPopulationBasename) != null));
712            assert(Thread.holdsLock(_lock)); // This lock must be held for safety.
713    
714            final long startTime = System.currentTimeMillis();
715            final long currentTick = EventPeriod.VLONG.getIntervalNumber(startTime);
716    
717            // Compute the maximum number of Scorers to attempt to evict in this run.
718            final NavigableSet<String> subPop = _scorerByGoodness.get(largestSubPopulationBasename);
719    
720            // Record the size of the sub-population when we start the scavenge.
721            final int startSubPopSize = subPop.size();
722    
723            // MINOR scavenge limits.
724            // Cap this to a very slow-growing fraction of the sub-population size
725            // so as to keep the mean run-time very close to O(1).
726            final int maxMinorPurgeCount = Math.min(2+GenUtils.log2Approx(startSubPopSize), startSubPopSize-2);
727            // Return immediately if not big enough for us to do even a decent minor scavenge from.
728            if(maxMinorPurgeCount < 2) { return; }
729    
730    //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging...  Sub-population size = "+(startSubPopSize)+".]"); }
731    
732            // The set of Scorers that we wish to evict...
733            final Set<String> evictees = new HashSet<String>(16 + 2*maxMinorPurgeCount);
734    
735            // Do MINOR purge...
736            // Remove anything from the extreme tail end (least goodness) part of the sub-population:
737            //   * Without merit (score < 0).
738            //   * Very similar to any item better than it in this tail.
739            //
740            // We copy this always-very-small tail into a new List, worst-first.
741            final List<String> tail = new ArrayList<String>(maxMinorPurgeCount);
742            final Iterator<String> tailIt = subPop.iterator();
743            for(int i = maxMinorPurgeCount; --i >= 0; ) { tail.add(tailIt.next()); }
744            assert(tail.size() == maxMinorPurgeCount);
745            assert(tail.size() < startSubPopSize); // Ensure that we can never zap the 'best' Scorer by mistake.
746            for(int i = 0; i < maxMinorPurgeCount; ++i)
747                {
748                final String n = tail.get(i);
749                // Never try and remove the base/parameterless Scorer from this sub-population.
750                if(n.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
751                final ScoreAndConf sac = _scorerScores.get(n).second;
752                // Zap anything without a positive score/confidence.
753                if((sac.score <= 0) || (sac.confidence == 0))
754                    {
755                    evictees.add(n);
756                    _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
757                    continue;
758                    }
759                // Zap anything very at all to any Scorer better than it in the tail.
760                final ScorerIF sc = getScorerInstance(n);
761                assert(n != null);
762                for(int j = i+1; j < maxMinorPurgeCount; ++j)
763                    {
764                    // We need to be at least a little different to everything else better than us
765                    // if we are to be retained.
766                    if(AbstractScorer.verySimilar(sc, getScorerInstance(tail.get(j))))
767                        {
768                        assert(!n.equals(tail.get(j))); // All Scorers in the tail must be different...
769                        evictees.add(n);
770                        _backstopLRU.put(n, _scorerScores.get(n)); // Push this into the back-stop list.
771                        break;
772                        }
773                    }
774                }
775    
776            // If the minor scavenge identified any poor Scorer
777            // then zap them and avoid a major/slow scavenge if more than one was found.
778            if(!evictees.isEmpty())
779                {
780                // Must remove from sub-population before removing from _scorerScores.
781                subPop.removeAll(evictees);
782                _scorerScores.keySet().removeAll(evictees);
783    
784                // We can avoid a major collection if we did better than simply trimming one entry
785                // which the quick-and-easy default non-scavenge mechanism will do anyway.
786                if(evictees.size() > 1)
787                    {
788    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: minor scavenge successful "+evictees.size()+'/'+startSubPopSize+" Scorers.]"); }
789                    return;
790                    }
791    
792                evictees.clear(); // Don't need to carry these over to the major collection.
793                // Fall through to consider doing a major scavenge...
794                }
795    
796    
797            // MAJOR scavenge limits.
798            // Cap this to a small fraction of the sub-population size so as not to extinguish too much at once.
799            // Cap this also to a fraction of the LRU backstop cache size so as not to overwhelm/flush it.
800            final int maxPurgeCount = Math.min(subPop.size()/8, _backstopLRU.size()/2);
801            // Return immediately if not big enough for us to scavenge from...
802            if(maxPurgeCount < 1) { return; }
803    
804            // After failure of a minor scavenge to make much/any space
805            // we may wish to fall back to our default kill-the-worst-one tactic,
806            // or we may try to force a major collection likely to reclaim a decent chunk immediately.
807            if(!FORCE_MAJOR_COLLECTION_AFTER_MINOR_FAILS)
808                {
809                // Most of the time, quit before slow/major scavenges to keep the mean execution time low.
810                // We only return on > 0 so that this will fail and fall through the first time.
811                if(countdownToNextScavenge.getAndDecrement() > 0) { return; }
812                // Postpone the next major scavenge enough to keep its contribution ~ O(1).
813                countdownToNextScavenge.addAndGet(1+maxPurgeCount);
814                }
815    
816    //if(IsDebug.isDebug) { System.out.println("[ScorerPopulation: scavenging (MAJOR)...  Sub-population size = "+(subPop.size())+".]"); }
817    
818            // Work through the Scorers from worst to best,
819            // stopping when we've found enough candidates.
820            // We examine the Scorers in pairs,
821            // deleiberately leaving at least alternate values untouched on any one pass.
822            for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
823                {
824                // Get 'less good' of the two Scorers that we will examine.
825                final String worse = it.next();
826    
827                // Never try and scavenge/remove the best (last) Scorer in this sub-population...
828                if(!it.hasNext()) { break; }
829                // Never try and remove the base/parameterless Scorer from this sub-population.
830                if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
831    
832                // Get the 'better' of the two Scorers compared this time.
833                assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
834                final String better = it.next();
835    
836                // If the worse of the pair is sufficiently old that we wouldn't return it even with allowStale
837                // then dump it immediately (don't even push it into the back-stop cache).
838                // This doesn't have to exactly match the cache behaviour (though doing so is good),
839                // but simply allows us to kick out very old Scorers that have not been re-evaluated at all recently
840                // so may be dead wood consuming valuable space.
841                final Pair<Long, ScoreAndConf> pairW = _scorerScores.get(worse);
842                final long tickWhenCached = EventPeriod.VLONG.getIntervalNumber(pairW.first);
843                if(currentTick - tickWhenCached > 1)
844                    {
845                    evictees.add(worse);
846    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged old Scorer: " + _scorerScores.get(worse) + ".]"); }
847                    continue;
848                    }
849    
850                // Only proceed if the scores of the two Scorers are reasonably similar.
851                final ScoreAndConf sacW = pairW.second;
852                final ScoreAndConf sacB = _scorerScores.get(better).second;
853                // For scores very close to zero we only treat identical scores as similar...
854                if((SIMILAR_SCORE_DELTA == 0) || (Math.abs(sacB.score) < (SIMILAR_SCORE_DELTA<<4)))
855                    { if(sacW.score == sacB.score) { continue; } }
856                else if(Math.abs(sacW.score - sacB.score) > SIMILAR_SCORE_DELTA) { continue; }
857    
858                // Check for genotypic similarity...
859                // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
860                if(!AbstractScorer.verySimilar(getScorerInstance(worse), getScorerInstance(better))) { continue; }
861    
862                // Mark this Scorer for removal (and push into the back-stop cache immediately).
863                evictees.add(worse);
864                _backstopLRU.put(worse, _scorerScores.get(worse));
865    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
866                }
867    
868            // If we didn't manage to evict more than one Scorer with out subtle weeding
869            // (which the quick-and-easy default non-scavenge mechanism would do anyway)
870            // then run an increasingly harsh filter until we fill our quota
871            // so as to pay for the time spent scavenging so far.
872            // We still will never evict the best Scorer,
873            // nor more than a small subset of all Scorers,
874            // and we still start evicting the poorest Scorers overall,
875            // so at worst this is probably similar to having a somewhat smaller maximum population size.
876            if(evictees.size() <= 1)
877                {
878                // Require adjacent Scorers by 'goodness' to be less and less similar to survive each pass
879                // until we manage to force enough Scorers out to give lots of elbow room for new entrants
880                // or all remaining Scorers really seem (genotypically) different
881                // (or we are effectively just forcing Scorers out by lottery).
882                final int nParams = getScorerInstance(subPop.first()).getParameterDefsAndValues().size();
883                // Start with small steps, ie using the finest razor available,
884                // but limit the maximum number of passes even for large parameter sets,
885                // to very approximately O(log(nParams)) unless we make good progress.
886                // We double the step if we did not find new evictees on the previous pass,
887                // else if we did clear some then we set the step back to 1 for maximum resolution.
888                boolean clearedOnLastPass; // True if we cleared anything on the last pass.
889                int step = 1; // Current similarity (decrease) step; strictly positive.
890                for(int similarity = nParams; (similarity > 0) && (evictees.size() < maxPurgeCount); similarity -= step, step = (clearedOnLastPass ? 1 : (2*step)))
891                    {
892    //if(IsDebug.isDebug) { System.out.println("_scavenge(): nParams="+nParams+", similarity="+similarity+", step="+step+", evictees.size()="+evictees.size()); }
893                    clearedOnLastPass = false;
894                    for(final Iterator<String> it = subPop.iterator(); it.hasNext() && (evictees.size() < maxPurgeCount); )
895                        {
896                        // Get 'less good' of the two Scorers that we will examine.
897                        final String worse = it.next();
898    
899                        // Never try and scavenge/remove the best (last) Scorer in this sub-population...
900                        if(!it.hasNext()) { break; }
901                        // Never try and remove the base/parameterless Scorer from this sub-population.
902                        if(worse.indexOf(AbstractScorer.SEPARATOR) == -1) { continue; }
903                        // Skip any Scorer already waiting to be purged...
904                        if(evictees.contains(worse)) { continue; }
905    
906                        // Get the 'better' of the two Scorers compared this time.
907                        assert(it.hasNext()); // We know from earlier test that there is at least one more Scorer.
908                        final String better = it.next();
909                        // Skip any Scorer already waiting to be purged...
910                        if(evictees.contains(better)) { continue; }
911    
912                        // Check for genotypic similarity...
913                        // Don't discard a 'worse' Scorer sufficiently different to its 'better'.
914                        if(!AbstractScorer.similarNParams(getScorerInstance(worse), getScorerInstance(better), similarity)) { continue; }
915    
916                        // Mark this Scorer for removal (and push into the back-stop cache immediately).
917                        evictees.add(worse);
918                        _backstopLRU.put(worse, _scorerScores.get(worse));
919                        clearedOnLastPass = true;
920    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged near ("+similarity+") 'clone' Scorer: " + _scorerScores.get(worse) + ".]"); }
921                        }
922                    }
923                }
924    
925            // Now efficiently purge all the evictees (if any) en masse.
926            if(!evictees.isEmpty())
927                {
928    //if(IsDebug.isDebug) { System.out.println("  [ScorerPopulation: scavenged "+evictees.size()+'/'+_scorerScores.size()+" Scorers at "+((System.currentTimeMillis()-startTime)/evictees.size())+"ms/Scorer.]"); }
929                // Must remove from sub-population before removing from _scorerScores.
930                subPop.removeAll(evictees);
931                _scorerScores.keySet().removeAll(evictees);
932                }
933            }
934    
935        /**Clear any caches which depend on the population state remaining unchanged.
936         * This clearing is done within the lock that protects the internal data structures
937         * (even though the caches can be safely accessed outside the lock)
938         * to minimise unwanted races (and eliminate them for lock-protected cache accesses).
939         */
940        private void _clearCaches()
941            {
942            synchronized(_lock)
943                {
944                // Clear any caches which may be otherwise invalid after an update.
945                _gCSWP_cache = null; // Will need to recompute the "best" Scorer set.
946                _gBS_cache.clear(); // Will need to recompute any breeding set.
947                }
948            }
949    
950        /**If true, allow posting the global event for exchange of the best/new Scorers.
951         * The mechanism must also be enabled in the GenProps for this to happen.
952         */
953        private static final boolean USE_GLOBAL_EVENTS_ALSO = true;
954    
955        /**Get current population size; non-negative.
956         * This value may slightly exceed the requested maximum.
957         * <p>
958         * This does not include the count of any entries in the back-stop LRU (negative) cache.
959         *
960         * @return count of unique Scorers that might be selected as part of a "breeding set"
961         */
962        public int size() { return(_scorerScores.size()); }
963    
964        /**Return the target maximum population size; non-negative.
965         * The actual population size may exceed this somewhat; this is a target.
966         */
967        public int maxSize() { return(maxPopulationSize); }
968    
969        /**Cache of result from getCurrentScorersWithParameters(); may be null if nothing cached.
970         * Initially null, and cleared on any update (put) to the population.
971         * <p>
972         * Any value cached here must be immutable for safety and sanity.
973         * <p>
974         * Marked volatile for thread-safe lock-free access.
975         */
976        private volatile Set<String> _gCSWP_cache;
977    
978        /**Current immutable set of available Scorer names and parameters (where applicable); never null but may be empty.
979         * The values returned are of the form <i>ScorerName{:name=value}*</i>.
980         * <p>
981         * The Scorers returned by this will generally be the best available,
982         * suitable for making a real prediction of and exhibit score,
983         * and will usually be the best one or handful per base-Scorer type,
984         * allowing that different Scorer types may have different "skill" domains,
985         * and excluding any with non-positive goodness.
986         * <p>
987         * This routine may be expensive, though will try to be efficient.
988         */
989        public Set<String> getCurrentScorersWithParameters(final boolean allowStale)
990            {
991            // Check the cache first.
992            // The cache is cleared by any put/update of the population.
993            final Set<String> cachedValue = _gCSWP_cache;
994            if(_gCSWP_cache != null) { return(cachedValue); }
995    
996            // Result set, of only +ve goodness Scorers.
997            final Set<String> r = new HashSet<String>(4 * baseScorers.size());
998    
999            // Hold a lock on the data structures while working.
1000            synchronized(_lock)
1001                {
1002                // Attempt to a candidate from each base type (sub population).
1003                for(final NavigableSet<String> subPop : _scorerByGoodness.values())
1004                    {
1005                    if(!subPop.isEmpty())
1006                        {
1007                        // Have the number of samples from each base name/type to include >= 1
1008                        // (zero iff there are no eligible candidates at all)
1009                        // and growing only (very) slowly with the population size.
1010                        // In any case we try to return a diverse group from each sub-population.
1011                        // We use log(log(n)) to allow for the possibility of multiple distinct useful candidates
1012                        // in very large populations, without wasting too much time on many near clones.
1013                        final int maxToInclude = Math.max(1, GenUtils.log2Approx(Math.max(1, GenUtils.log2Approx(subPop.size()))));
1014    
1015                        // Get the best Scorer of this type and its "goodness".
1016                        final String best = subPop.last();
1017                        final int bGoodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(best).second);
1018                        if(bGoodness <= 0) { continue; /* All bad! */ }
1019                        // We'll accept other Scorers nearly as good as the best,
1020                        // but NOT of identical goodness or otherwise too similar (pheno/genotypically)
1021                        // as we want (useful) variety in what we return, eg for multimodal matching.
1022                        final int threshold = bGoodness/2;
1023                        // Set of goodnesses so far: we don't return two identical.
1024                        final Set<Integer> goodnesses = new HashSet<Integer>(2 * maxToInclude);
1025                        // Set of scores so far: we don't return two identical.
1026                        final Set<Short> scores = new HashSet<Short>(2 * maxToInclude);
1027                        // Set of Scorer instances so far; we don't return two 'very similar'.
1028                        final Collection<ScorerIF> instances = new ArrayList<ScorerIF>(maxToInclude);
1029                        // Work backwards from the best (which is always included)...
1030                        int remaining = maxToInclude;
1031                        main: for(final String snp : subPop.descendingSet())
1032                            {
1033                            final ScoreAndConf sac = _scorerScores.get(snp).second;
1034                            final Short score = sac.score;
1035                            if(scores.contains(score)) { continue; /* Identical to a previous score. */ }
1036                            scores.add(sac.score);
1037                            final int g = ScoreAndConf.computeScorerGoodness(sac);
1038                            // Stop if the goodness falls too far (or to/below zero).
1039                            if(g <= threshold) { break; }
1040                            assert(g > 0); // Only keeping Scorers with merit.
1041                            final Integer G = g;
1042                            if(goodnesses.contains(G)) { continue; /* Identical to a previous goodness. */ }
1043                            goodnesses.add(G);
1044                            // Make sure that this Scorer is not of very similar genotype to a previous one.
1045                            final ScorerIF sc = getScorerInstance(snp);
1046                            assert(sc != null);
1047                            // We require increasingly-different Scorer instances as we gather more Scorers.
1048                            final int maxSimilarity = Math.max(0, sc.getParameterDefsAndValues().size() - instances.size());
1049                            for(final ScorerIF prevSc : instances)
1050                                { if(AbstractScorer.similarNParams(prevSc, sc, maxSimilarity)) { continue main; /* Similar genotype to extant returnees. */ } }
1051                            // Record new genetically-different Scorer.
1052                            instances.add(sc);
1053                            // This Scorer is good enough to return.
1054                            r.add(snp);
1055                            // Stop if we've collected enough.
1056                            if(--remaining <= 0) { break; }
1057                            }
1058                        assert(r.contains(best));
1059                        }
1060                    }
1061                }
1062    
1063            // We avoid cacheing empty results to assist with fast bootstrapping.
1064            if(r.isEmpty()) { return(Collections.emptySet()); }
1065            // Wrap the result to make it immutable, and cache it (unless empty).
1066            final Set<String> result = Collections.unmodifiableSet(r);
1067            _gCSWP_cache = result;
1068    
1069            return(result);
1070            }
1071    
1072        /**Cache of results for getBreedingSet(); never null.
1073         * Cleared by any put/update operation on the population.
1074         * <p>
1075         * Thread-safe map from (valid) baseName to breeding set.
1076         */
1077        private final ConcurrentMap<String, List<String>> _gBS_cache = new ConcurrentHashMap<String, List<String>>();
1078    
1079        /**Get immutable random-access list of Scorers to "breed" from, best first; never null, possibly empty.
1080         * This returns all of (or some of the best of) the Scorers with the given base name.
1081         * <p>
1082         * This generally trims away all Scorers with non-positive values,
1083         * but will always try to return the few best items regardless,
1084         * ie the result is not completely empty unless the specified sub-population is.
1085         * <p>
1086         * The result will generally be large if the population is.
1087         */
1088        public List<String> getBreedingSet(final String baseName)
1089            {
1090            // Validate the base name first.
1091            if(baseName == null) { throw new IllegalArgumentException(); }
1092            if(!baseScorers.keySet().contains(baseName)) { throw new IllegalArgumentException(); }
1093    
1094            final List<String> cachedValue = _gBS_cache.get(baseName);
1095            if(cachedValue != null) { return(cachedValue); }
1096    
1097            // Hold lock while accessing mutable data.
1098            synchronized(_lock)
1099                {
1100                // Get in "worst-first" order.
1101                final ArrayList<String> r = new ArrayList<String>(_scorerByGoodness.get(baseName));
1102                // Convert to "best-first" order.
1103                Collections.reverse(r);
1104                final int size = r.size();
1105                final int limit = size/4;
1106                // Eliminate all those with non-positive goodness if possible,
1107                // but don't remove the best ~25% however poor they may appear to be
1108                // else we will not have any material to breed (and try to improve) from.
1109                for(int i = size; --i > limit; )
1110                    {
1111                    final int goodness = ScoreAndConf.computeScorerGoodness(_scorerScores.get(r.get(i)).second);
1112                    if(goodness > 0) { break; }
1113                    // Remove this trailing zero-goodness item.
1114                    r.remove(i);
1115                    }
1116                // Result must not be empty unless the sub-population is.
1117                assert(_scorerByGoodness.get(baseName).isEmpty() || !r.isEmpty());
1118    
1119                // Wrap the result for immutablity and cache it.
1120                final List<String> result = Collections.unmodifiableList(r);
1121                _gBS_cache.putIfAbsent(baseName, result); // Reduce chance of update races.
1122    
1123                return(result);
1124                }
1125            }
1126    
1127        /**Interface of call-back hook to report a new "best" Scorer by name, eg to the system event record.
1128         */
1129        public interface NewBestCallbackIF
1130            {
1131            /**Used to post a new "best of breed" Scorer name and parameters for posterity.
1132             * This routine should not throw any exceptions.
1133             * @param scorerNameAndParameters  valid name-and-parameters set; never null.
1134             */
1135            public void reportNewBestScorer(final String scorerNameAndParameters);
1136            }
1137    
1138    
1139        /**Small thread-safe LRU cache for getScorerInstance(); never null.
1140         * Intended to avoid repeated re-parsing of args and recreation of instances, etc,
1141         * for very-commonly-used Scorers used to evaluate every exhibit.
1142         * Usually we'd expect this to be a handful actually needed regularly,
1143         * but while hunting for new solutions (or scavenging for space)
1144         * we'll expect to churn this cache quite hard.
1145         */
1146        private final SimpleLRUMapAutoSizeForHitRate<String, ScorerIF> _gSI_cache;
1147    
1148        /**Get Scorer instance given the Scorer{:value=name}* format; null if no such Scorer available.
1149         * Any instance returned may be a shared/cached instance rather than a new instance.
1150         * <p>
1151         * This does not attempt to canonicalise the names it is given,
1152         * so may create and cache duplicates for example.
1153         * <p>
1154         * This can be used to create any valid Scorer instance,
1155         * not just entries in the given population.
1156         */
1157        public ScorerIF getScorerInstance(final String nameAndParameters)
1158            {
1159            if(nameAndParameters == null) { throw new IllegalArgumentException(); }
1160    
1161            // Try lookup in the cache; return a cached result without further ado!
1162            final ScorerIF cached = _gSI_cache.get(nameAndParameters);
1163            if(cached != null) { return(cached); }
1164    
1165            // Quick extraction of base name.
1166            final int fs = nameAndParameters.indexOf(AbstractScorer.SEPARATOR);
1167            final boolean hasParams = (fs != -1);
1168            final String baseName = hasParams ? nameAndParameters.substring(0, fs) : nameAndParameters;
1169    
1170            // Get a base instance of the Scorer, if any.
1171            final ScorerIF sc = baseScorers.get(baseName);
1172            // If this base name is not known/supported, stop now.
1173            if(sc == null) { return(null); }
1174    
1175            // Fast-track the simple no-parameters case.
1176            if(!hasParams) { return(sc); }
1177    
1178            try
1179                {
1180                // Use base non-parameterised instance to create variant.
1181                final ScorerIF newInstance = sc.createVariant(nameAndParameters);
1182                assert(newInstance != null);
1183                // Cache and return this value.
1184                _gSI_cache.put(nameAndParameters, newInstance);
1185                return(newInstance);
1186                }
1187            // Handle a simple argument-format error by propagating up to caller as usual.
1188            catch(final IllegalArgumentException e) { throw e; }
1189            // Log and propagate any other error.
1190            catch(final Exception e) { e.printStackTrace(); throw new Error(e); }
1191            }
1192    
1193        }