001    /*
002    Copyright (c) 1996-2009, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.ai.scorer;
031    
032    import java.io.IOException;
033    import java.util.ArrayList;
034    import java.util.BitSet;
035    import java.util.Collections;
036    import java.util.Comparator;
037    import java.util.HashMap;
038    import java.util.HashSet;
039    import java.util.List;
040    import java.util.Map;
041    import java.util.Random;
042    import java.util.Set;
043    import java.util.SortedMap;
044    import java.util.TreeMap;
045    import java.util.concurrent.ArrayBlockingQueue;
046    import java.util.concurrent.ConcurrentHashMap;
047    import java.util.concurrent.ConcurrentMap;
048    import java.util.concurrent.ConcurrentSkipListMap;
049    
050    import org.hd.d.pg2k.ai.scorer.fixed.FixedScore;
051    import org.hd.d.pg2k.ai.scorer.fixed.MaxScore;
052    import org.hd.d.pg2k.ai.scorer.fixed.MinScore;
053    import org.hd.d.pg2k.ai.scorer.fixed.NoConfidence;
054    import org.hd.d.pg2k.ai.scorer.fixed.RandomPositiveScore;
055    import org.hd.d.pg2k.ai.scorer.fixed.RandomScore;
056    import org.hd.d.pg2k.ai.scorer.fixed.ZeroScore;
057    import org.hd.d.pg2k.ai.scorer.parameterised.LocalSampler;
058    import org.hd.d.pg2k.ai.scorer.parameterised.SimpleExposure;
059    import org.hd.d.pg2k.svrCore.AllExhibitImmutableData;
060    import org.hd.d.pg2k.svrCore.AllExhibitProperties;
061    import org.hd.d.pg2k.svrCore.CoreConsts;
062    import org.hd.d.pg2k.svrCore.ExhibitName;
063    import org.hd.d.pg2k.svrCore.ExhibitPropsComputableMutable;
064    import org.hd.d.pg2k.svrCore.GenUtils;
065    import org.hd.d.pg2k.svrCore.MemoryTools;
066    import org.hd.d.pg2k.svrCore.Name;
067    import org.hd.d.pg2k.svrCore.Rnd;
068    import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
069    import org.hd.d.pg2k.svrCore.TextUtils;
070    import org.hd.d.pg2k.svrCore.ThreadUtils;
071    import org.hd.d.pg2k.svrCore.ExhibitPropsComputableMutable.Factor;
072    import org.hd.d.pg2k.svrCore.Name.ExhibitFull;
073    import org.hd.d.pg2k.svrCore.Name.ExhibitShort;
074    import org.hd.d.pg2k.svrCore.Tuple.Pair;
075    import org.hd.d.pg2k.svrCore.Tuple.Triple;
076    import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
077    import org.hd.d.pg2k.svrCore.vars.EventPeriod;
078    import org.hd.d.pg2k.svrCore.vars.EventVariableValue;
079    import org.hd.d.pg2k.svrCore.vars.SimpleVariableDefinition;
080    import org.hd.d.pg2k.svrCore.vars.SystemVariables;
081    
082    import ORG.hd.d.IsDebug;
083    
084    
085    /**Simple/default implementation to compute (and cache) the score and confidence for exhibits.
086     * Note: since the result of this computation may be used in computing
087     * (EPCM) the ExhibitPropsComputableMutable value for an exhibit,
088     * then any implementation of this must avoid forcing recalculation
089     * of any EPCM value to avoid danger of infinite recursion
090     * (other than the static calcVoteFactor() method).
091     * Ideally the value computed will not depend on any EPCM value.
092     */
093    public final class ScorerCacheImpl extends AbstractScorerCache implements ScorerCacheIF
094        {
095        /**Construct an instance attached to the supplied data source.
096         * We automatically record/persist/share new best-of-breed Scorers via the event mechanism.
097         *
098         * @param dataSource  full-access live data source; must not be null
099         */
100        public ScorerCacheImpl(final SimpleExhibitPipelineIF dataSource,
101                               final SimpleLoggerIF log)
102            {
103            super(new ScorerPopulation(fixedSimpleScorers, MAX_SCORER_SCORES_RETAINED, dataSource),
104                  dataSource,
105                  log);
106    
107            // Set up our work/evolution object.
108            scorerWork = new ScorerCreator.ScorerWork(this, log, true);
109    
110            // Set up the inbound Scorer queue, size-limited.
111            inboundScorerQueue = new ArrayBlockingQueue<String>(MAX_SCORER_SCORES_RETAINED/16);
112    
113            // Set up the calibration set cache, size-limited.
114            // We bound the size by allowing several (>2) set sizes for each base Scorer type.
115            _eCS_cache = MemoryTools.SimpleLRUMapAutoSizeForHitRate.<Triple<String,Integer,Boolean>, Pair<Long, Map<Name.ExhibitShort,ScoreAndConf>>>create(1+fixedSimpleScorers.size(), 1 + 5*fixedSimpleScorers.size(), "_eCS_cache");
116            }
117    
118        /**Computes a weighted composite score [-1,+1] and confidence [0,+1] for the specified exhibit with the best available scorers/parameters; never null but may be (0,0).
119         * This uses the Scorers available from getCurrentScorersWithParameters(),
120         * but skipping any marked as BadScorer that have made it through filtering.
121         * <p>
122         * This explicitly limits the time it spends on computation,
123         * more so if the system is short of power.
124         * <p>
125         * This tries to force the evolution of new Scorers in the background
126         * if it finds none at all usable during filtering and computation.
127         * (This means that the Scorer system can bootstrap itself just by calls to this routine,
128         * but will force some possibly-wasteful work for exhibit types that cannot be scored.)
129         *
130         * @param exhibitName  valid full exhibit name
131         * @param allowStale  if true then allow a stale value from cache,
132         *     else throw an exception if nothing is currently available
133         *
134         * @return (0,0) if named scorer is not available,
135         *         cannot be used with the specified parameters,
136         *         or cannot be applied to the indicated exhibit
137         *         (eg because of the exhibit type or the exhibit does not exist);
138         *         else returns a non-null ScoreAndConf value
139         */
140        public ScoreAndConf computeCompositeScoreAndConfidence(final Name.ExhibitFull exhibitName,
141                                                               final boolean allowStale)
142            throws IOException
143            {
144            if(!ExhibitName.validNameSyntax(exhibitName)) { throw new IllegalArgumentException("exhibit name must be full, not short"); }
145    
146            final long startTime = System.currentTimeMillis();
147            final long endTime = startTime + 1 +
148                (GenUtils.mustConservePower() ? 0 : CoreConsts.MAX_INTERACTIVE_DELAY_MS/4);
149    
150            // We apply each available non-zero-confidence Scorer to the exhibit,
151            // and compute a single overall score.
152            // We don't cache this result;
153            // we recompute it each time as it should be quick to so do
154            // as all of its components are cached.
155            // Note that this may be VERY slow if it forces all Scorers to be (re)evaluated.
156    
157            // Compute results from individual eligible Scorers.
158            // Because of the lack of symmetry in scoring
159            // (ie reversing the value of a bad Scorer does not necessarily make it a good one)
160            // we are only interested in the highest-confidence, positive-correlation Scorers.
161            final SortedMap<ScoreAndConf, ScorerIF> results = new TreeMap<ScoreAndConf, ScorerIF>(ScoreAndConf.ByConfidence);
162            Set<String> currentScorersWithParameters = getCurrentScorersWithParameters(allowStale);
163            // As a bootstrap measure on an (effectively) empty population
164            // try to force evaluation of the base Scorers here,
165            // some of which may have some predictive merit anyway.
166            // We also try to kick off a background thread to create new Scorers.
167            if(currentScorersWithParameters.isEmpty())
168                {
169                // Try the base Scorer set as a fallback.
170                currentScorersWithParameters = getBaseScorersWithoutParameters();
171                // Try to kick-start Scorer creation in the background with a reasonable chunk of CPU.
172                evolve(GenUtils.mustConservePower());
173                }
174            // Now evaluate this exhibit with each Scorer,
175            // in a random order to avoid bias in case we run out of time at some point.
176            final List<String> shuffledCSWP = new ArrayList<String>(currentScorersWithParameters);
177            Collections.shuffle(shuffledCSWP, Rnd.fastRnd);
178            for(final String snp : shuffledCSWP)
179                {
180                // Compute the weighting over all available exhibits.
181                final ScoreAndConf scorerWeighting = computeScorerWeighting(snp, allowStale, null);
182                // Quickly reject those Scorers with a non-positive score or confidence.
183                if(scorerWeighting.score <= 0) { continue; }
184                if(scorerWeighting.confidence == 0) { continue; }
185                // Get Scorer instance.
186                final ScorerIF sc = getScorerInstance(snp);
187                assert(sc != null);
188                // Now "score" the exhibit with this Scorer.
189                final ScoreAndConf score = computeUnweightedScoreAndConfidence(exhibitName, sc, allowStale);
190                // Apply the weighting to the score, rejecting any whose confidence is now zero.
191                // Note that we only use the sign of the Scorer correlation (score) for the score,
192                // but we multiply both the Scorer correlation and confidence into the final confidence.
193                final ScoreAndConf weighted = new ScoreAndConf(score.score,
194                                                               (int) ((score.confidence * scorerWeighting.confidence * (long)scorerWeighting.score) / (ScoreAndConf.MAX * (long) ScoreAndConf.MAX)));
195                if(weighted.confidence == 0) { continue; }
196                assert(weighted.confidence > 0);
197                results.put(weighted, sc);
198    
199                // Try to cap total runtime providing that we have at least one result so far...
200                if(System.currentTimeMillis() > endTime) { break; }
201                }
202    
203            final int rSize = results.size();
204            // Handle trivial cases quickly...
205            // If no applicable results then return "no opinion",
206            // and try to quickly evolve some usable Scorers.
207            if(rSize == 0) { evolve(true); return(ScoreAndConf.NO_OPINION); }
208    
209            // Get the keys (S&C values) in rising order by confidence (ignoring sign).
210            List<ScoreAndConf> sorted = new ArrayList<ScoreAndConf>(results.keySet());
211            assert(sorted.get(0).confidence <= sorted.get(sorted.size()-1).confidence);
212    
213            // Discard the least-confident results if we have more than a few.
214            // The aim is to reduce noise and improve "strength" of answer.
215            // We do this BEFORE weeding out any "straw man" dummy/fake/test Scorers
216            // so as to help push out any very weak real Scorers (weaker than chance/faking).
217            // We don't discard too many Scorers so as to allow for multi-modal ranking
218            // by a reasonable diversity of Scorer types.
219            // A real Scorer is only guaranteed to make the result if better than all the fakes.
220            final int toDiscard = (rSize >>> 1); // FIXME: Optimise/document this fraction.
221            if(toDiscard > 0) { sorted = sorted.subList(toDiscard, rSize); }
222            // Now weed out any remaining dummy/stooge/test/benchmark Scorers.
223            for(int i = sorted.size(); --i >= 0; )
224                { if(results.get(sorted.get(i)) instanceof BadScorer) { sorted.remove(i); } }
225            // Recompute the size.
226            final int finalSize = sorted.size();
227    
228            // Handle trivial cases quickly...
229            // If no applicable results then return "no opinion",
230            // and try to evolve some usable Scorers.
231            if(finalSize == 0) { evolve(true); return(ScoreAndConf.NO_OPINION); }
232            // If exactly one result then return it "as is".
233            if(finalSize == 1) { return(sorted.get(0)); }
234    
235            // Get the highest confidence value.
236            final int highestConfidence = sorted.get(finalSize-1).confidence;
237            assert(sorted.get(0).confidence <= highestConfidence);
238            // Compute the result score as the mean score*confidence
239            // (thus contributing to the result in proportion to confidence)
240            // normalised to be score*1 for the highest confidence,
241            // and the confidence being the RMS confidence of all contributors
242            // (to be biased towards greater confidence in the result
243            // when a number of low-confidence Scorers are included).
244            // Work downward from the best, stopping if we run out of time.
245            long totalSC = 0;
246            long totalConfSq = 0;
247            int valuesUsed = 0;
248            for(int i = finalSize; --i >= 0; )
249                {
250                final ScoreAndConf sc = sorted.get(i);
251                totalSC += (((sc.score * sc.confidence) * (long)ScoreAndConf.MAX) / highestConfidence);
252                totalConfSq += (sc.confidence * sc.confidence);
253                ++valuesUsed;
254                // Try to cap total runtime.
255                if(System.currentTimeMillis() > endTime) { break; }
256                }
257            assert(valuesUsed > 0);
258            // Normalise the score prediction to be able to run the full gammut [-MAX,+MAX].
259            final long rawCompositeScore = totalSC / (ScoreAndConf.MAX * valuesUsed);
260    //if(IsDebug.isDebug) { System.out.println("rSize = " + finalSize); }
261    //if(IsDebug.isDebug) { System.out.println("rawCompositeScore = " + rawCompositeScore); }
262            final long rawCompositeConf = Math.round(Math.sqrt(totalConfSq / valuesUsed));
263    //if(IsDebug.isDebug) { System.out.println("rawCompositeConf = " + rawCompositeConf); }
264            return(new ScoreAndConf((short) Math.min(ScoreAndConf.MAX, Math.max(-ScoreAndConf.MAX, rawCompositeScore)),
265                                    (short) Math.min(ScoreAndConf.MAX, Math.max(0, rawCompositeConf))));
266            }
267    
268    
269        /**Maximum number of Scorers for which the system retains/caches scores; strictly positive.
270         * This limits the amount of memory dedicated to cache, etc.
271         * <p>
272         * In practice 2^13--2^16 seems to be a reasonable limit.
273         */
274        private static final int MAX_SCORER_SCORES_RETAINED = (1 << 13);
275    
276        /**The minimum recent period for which we collect all available votes, in ms; strictly positive.
277         * This is usually at least a year to smooth out for seasonal patterns.
278         */
279        private static final long MIN_VOTE_SAMPLE_NEAR_TERM_TIME_MS = 365242L * 24 * 3600; // 365.242 days in a year.
280    
281        /**ScoreAndConfidence for the given scorer itself over all exhibit types; never null but may be (0,0) where the scorer is unknown or untested.
282         * Essentially the result of this should be multiplied by the result for each exhibit
283         * (for the same scorer and parameters)
284         * to normalise the predicted score and confidence for the exhibit,
285         * though usually the sign of the correlation (score) is multiplied into the raw score,
286         * while the value of correlation (score) is multiplied with the confidence
287         * into the final confidence.
288         * <p>
289         * We attempt to ignore small numbers of errors for robustness.
290         *
291         * @param allowStale  if true then allow a stale or low-confidence value from cache,
292         *     else throw an exception if nothing is currently available
293         *     and we cannot quickly compute enough points to increase our confidence
294         * @return  the score represents the correlation with the underlying votes
295         *     (and whatever the scoring is measured against)
296         *     with MAX meaning perfect correlation, 0 meaning no correlation,
297         *     and -MAX meaning perfectly wrong answers all the time,
298         *     and the confidence 0 if we have no (or very/too few) data points
299         *     and approaching MAX as we have a large (enough) number of data points
300         */
301        public ScoreAndConf computeScorerWeighting(final ScorerIF scorer,
302                                                   final boolean allowStale, final String source)
303            throws IOException
304            {
305            if(scorer == null) { throw new IllegalArgumentException(); }
306    
307            // Return what we have in cache, if anything.
308            final ScoreAndConf cachedValue = population.getScorerWeighting(scorer, allowStale);
309            if(cachedValue != null) { return(cachedValue); }
310    
311            final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
312            // If there are no live exhibits then we can't compute (nor cache) a result.
313            // This is especially important to avoid cacheing spurious false results at start-up.
314            if(aeid.length == 0) { return(ScoreAndConf.NO_OPINION); }
315    
316            // Compute weighting (effectively the correlation) from scratch.
317            // Use a decent-quality subset of exhibits when the system is short of memory or power,
318            // in particular because this is likely to help reduce memory pressure
319            // from reducing the number of benchmark images being cached and checked against.
320            final boolean trim = (!MemoryTools.lotsFree()) || GenUtils.mustConservePower();
321            final Map<Name.ExhibitShort, ScoreAndConf> benchmarkExhibits = getVotedForExhibitsAndVoteFactors(allowStale, trim);
322            // If there are no voted-for exhibits then we cannot compute (nor cache) a result.
323            if(benchmarkExhibits.isEmpty()) { return(ScoreAndConf.NO_OPINION); }
324    
325            // Compute the calibration result.
326            final Pair<ScoreAndConf, Boolean> weighting = ScorerCreator.computeWeighting(benchmarkExhibits, this, scorer, aeid, allowStale);
327            final ScoreAndConf result = weighting.first;
328            final boolean enoughData = !weighting.second;
329    
330            // Atomically store/replace the result in cache
331            // iff enough data points were available and we hadn't run out of time, etc.
332            // We allow values computed with potentially-stale inputs to be cached
333            // as probably reasonably accurate if other conditions are met.
334            // Note that if the Scorer that we have just generated is the best
335            // then it may be persisted/propagated automatically by the population object.
336            if(enoughData)
337                {
338                final boolean isBest = population.putScorerWeighting(scorer, result, dataSource, log);
339                if(isBest)
340                    {
341                    log.log("[Computed/cached 'best' Scorer from source '"+source+"'; score and confidence "+result+"; "+scorer.getNameAndParameters()+" .]");
342                    }
343                }
344    
345            return(result);
346            }
347    
348    
349        /**Compute voted-for-exhibit set; never null but may be empty.
350         * This attempts to retrieve the latest vote event sets for all or most time slots
351         * (concentrating on more recent ones if necessary)
352         * and computes the set of all exhibits that have a vote.
353         * <p>
354         * This does <em>NOT</em> attempt to filter the result in any way,
355         * ie by which exhibits are still live or even syntactically-valid.
356         *
357         * @return  the (non-null, possibly empty) set of voted-for-exhibits (short names)
358         *     up to the last full VLONG slot
359         *     paired with the time at which we started to compute/fetch it
360         * @throws IOException  if the data is not available or there is a timeout
361         */
362        private Pair<Long, Set<Name.ExhibitShort>> _computeRawVotedForExhibits(final SimpleExhibitPipelineIF vars)
363            throws IOException
364            {
365            final long startTime = System.currentTimeMillis();
366            final Set<Name.ExhibitShort> exhibits = new HashSet<Name.ExhibitShort>(2*MIN_VOTED_FOR_SET_SIZE);
367    
368            // Get vote events for all retained full/complete (VLONG) slots.
369            final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(startTime);
370            final BitSet whichValues = new BitSet(SystemVariables.EVENT_SAMPLES_RETAINED);
371            whichValues.set(0, SystemVariables.EVENT_SAMPLES_RETAINED);
372            final SimpleVariableDefinition[] defs = new SimpleVariableDefinition[]{ SystemVariables.VOTE_PRO, SystemVariables.VOTE_CON };
373            for(final SimpleVariableDefinition def : defs)
374                {
375                final EventVariableValue evvsAll[] = vars.getEventValues(
376                    def, EventPeriod.VLONG, currentPeriod, whichValues);
377                for(final EventVariableValue evv : evvsAll)
378                    {
379                    if((evv != null) && (evv.getTotalEventCount() != 0))
380                        {
381                        // We expect the keys to be short exhibit names
382                        // which we convert to Name.ExhibitShort values.
383                        final AllExhibitImmutableData aeid = vars.getAllExhibitImmutableData(-1);
384                        for(final Object o : evv.getDistinctValuesInRankOrder())
385                            {
386                            final ExhibitFull fullName = aeid.getFullName((String) o);
387                            if(null == fullName) { continue; } // Can only happen if the data set changes..
388                            exhibits.add(fullName.getShortName());
389                            }
390                        }
391                    }
392                }
393    
394            return(new Pair<Long, Set<Name.ExhibitShort>>(startTime, exhibits));
395            }
396    
397        /**Cache of untrimmed voted-for exhibits (by short name) and their converted vote factors; never null.
398         * The exhibit set is those that have been voted for or against,
399         * possibly including no-longer valid/extant exhibit names,
400         * because we will be using this for calibration.
401         * <p>
402         * Private to getVotedForExhibits().
403         * <p>
404         * We index from the time at which we gathered the set
405         * so that we can regenerate or trim this list if the AEP or event slot changes.
406         * <p>
407         * Because the (first) map is sorted it is quick to find the latest/oldest entries.
408         * <p>
409         * Usually we only keep one value (the latest one),
410         * and any older values are purged only AFTER a new value has been inserted,
411         * ie this map never becomes transiently empty nor more stale than necessary once non-empty.
412         * <p>
413         * We can keep older information around to allow some (stale) results to be computed
414         * even if we cannot immediately fetch new information.
415         * <p>
416         * The outer and inner maps are thread-safe.
417         * <p>
418         * Do not assume that other activity in this collection can be locked out
419         * by holding a lock on this object.  DO NOT hold a lock on this object!
420         * <p>
421         * The values must be immutable to enable them to be shared safely without copying.
422         * <p>
423         * TODO: clear this safely when memory is stressed.
424         */
425        private final SortedMap<Long, Map<Name.ExhibitShort, ScoreAndConf>> _votedForExhibitsUntrimmed =
426            new ConcurrentSkipListMap<Long, Map<Name.ExhibitShort,ScoreAndConf>>();
427    
428        /**Cache of trimmed voted-for exhibits (by short name) and their converted vote factors; never null.
429         * Cache of trimmed values otherwise as for _votedForExhibitsUntrimmed.
430         * <p>
431         * Only pure votes are in this selection; no proxies/substitutes.
432         */
433        private final SortedMap<Long, Map<Name.ExhibitShort, ScoreAndConf>> _votedForExhibitsTrimmed =
434            new ConcurrentSkipListMap<Long, Map<Name.ExhibitShort,ScoreAndConf>>();
435    
436        /**Nominal amount of heap space (bytes) to be free to accommodate one voted-for exhibit as well as normal Gallery operations; strictly positive.
437         * No more than say 25% to 50% of this amount of heap space should actually be used
438         * (eg by cached data such as expanded/sampled images for quick processing, and other artifacts)
439         * for each member of the voted-for set.
440         */
441        private static final int FREE_HEAP_PER_VFE = 1024 * 1024;
442    
443        /**Maximum heap space per voted-for exhibit to support AI Scorer work in bytes; strictly positive.
444         * This should cover all memory resources,
445         * but is not an absolute per item limit nor an invitation to use up to the limit.
446         */
447        public static final int MAX_BYTES_PER_VOTED_FOR_EXHIBIT = FREE_HEAP_PER_VFE / 2;
448    
449        /**If true then fold in download/viewing stats with explicit voting.
450         * This may make for a more robust and topical/accurate view of popularity.
451         */
452        private static final boolean ALLOW_VIEWING_STATS_ALONGSIDE_VOTES = true;
453    
454        /**Maximum confidence (+ve/-ve) for vote stand-in data in range [0,ScoreAndConf.MAX].
455         * We will allow our confidence in a stats-derived ersatz 'vote' to be no higher than this,
456         * nor even as high as any actual real vote factors confidence value,
457         * ie so real votes beat synthetic ones every time.
458         * <p>
459         * Usually a small fraction of ScoreAndConf.MAX to prefer real votes by a huge margin.
460         */
461        private static final short MAX_NON_VOTE_CONF = ScoreAndConf.MAX >>> 3;
462    
463        /**If true then when allowing viewing stats alongside votes we can score exhibits for not being viewed, etc.
464         * One problem with generating negative scores for exhibits without positive information
465         * is that we may hugely inflate the universe of exhibits to be examined
466         * (with an inflation in CPU/memory/other resources consumed)
467         * without necessarily adding much value or information.
468         * <p>
469         * In particular the memory burden of holding many thumbnails in memory for processing
470         * may be significant.
471         * <p>
472         * This should usually be false.
473         */
474        private static final boolean ALLOW_FILL_IN_FOR_ABSENT_STATS_AND_VOTES = false;
475    
476        /**Get latest (immutable) Map of all voted-for exhibits (by short name) to vote score; never null.
477         * Returns a set of extant exhibits that have been voted for or against.
478         * <p>
479         * Uses a cache to avoid wasted computation from one event tick to the next,
480         * and to allow this mechanism to work even in the face of brief outages
481         * in data connectivity (eg when trying to fetch new values of the vote events).
482         * <p>
483         * We convert the vote Factor to a ScoreAndConf value so that more computations
484         * can be done as integer rather than floating, especially important for Niagara.
485         *
486         * @param allowStale  if true, allow older data to be returned to save time,
487         *     or if new event data should be used but cannot be fetched
488         *     (ie allow working from stale event data for speed and/or robustness)
489         * @param trim  if true then trim the result to a minimal useful set
490         *     (ie typically no more than MIN_VOTED_FOR_SET_SIZE, dropping low-confidence values),
491         *     else we may supplement the vote data with other usage stats
492         *     if the result size would be MIN_VOTED_FOR_SET_SIZE
493         */
494        private Map<Name.ExhibitShort, ScoreAndConf> getVotedForExhibitsAndVoteFactors(final boolean allowStale,
495                                                                                       final boolean trim)
496            throws IOException
497            {
498            final long startTime = System.currentTimeMillis();
499    
500            // What is the current period (ie for non-stale data)?
501            final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(startTime);
502    
503            // Select appropriate (trimmed or full) value cache.
504            final SortedMap<Long, Map<Name.ExhibitShort, ScoreAndConf>> _votedForExhibits =
505                trim ? _votedForExhibitsTrimmed : _votedForExhibitsUntrimmed;
506            // Spin to retrieve latest cached result that is current (up-to-date).
507            // (If allowStale is true, we allow a previous-tick result.)
508            // This relies on:
509            //  1) the map never going empty once non-empty
510            //  2) the map never getting more stale
511            //     (refetching the last key can only get the same or a newer value)
512            // This can only return with a "current" (enough) item.
513            // We fall through if an appropriate value is not available.
514            if(!_votedForExhibits.isEmpty())
515                {
516                while(EventPeriod.VLONG.getIntervalNumber(_votedForExhibits.lastKey()) >= (allowStale ? currentPeriod-1 : currentPeriod))
517                    {
518                    final Map<Name.ExhibitShort, ScoreAndConf> result = _votedForExhibits.get(_votedForExhibits.lastKey());
519                    if(null != result) { return(result); }
520                    }
521                }
522    
523            // No suitable cached up-to-date value,
524            // so try to compute one (and cache it) now.
525            // If this cannot complete (generates an exception) and allowStale is false
526            // then we propagate that exception upwards.
527            Pair<Long, Set<Name.ExhibitShort>> rawSet = null;
528            IOException err = null;
529            if(!allowStale)
530                { rawSet = _computeRawVotedForExhibits(dataSource); }
531            else
532                { try { rawSet = _computeRawVotedForExhibits(dataSource); } catch(final IOException e) { err = e; /* Record for later. */ } }
533    
534            // If we got something, then cache it, even though it *may* now not be the newest.
535            if(rawSet != null)
536                {
537                final AllExhibitProperties aep = dataSource.getAllExhibitProperties(-1);
538                // We refuse to cache the result from an empty AEP as possibly erroneous.
539                if(aep.aeid.length == 0) { return(Collections.emptyMap()); }
540    
541                final Set<Name.ExhibitShort> vfe = rawSet.second;
542                final Map<Name.ExhibitShort, ScoreAndConf> m = new HashMap<Name.ExhibitShort, ScoreAndConf>(2 * vfe.size());
543                for(final Name.ExhibitShort shortName : vfe)
544                    {
545                    // If the exhibit is missing (eg has been deleted or renamed) then skip it.
546                    final Name.ExhibitFull fullName = aep.aeid.getFullName(shortName);
547                    if(fullName == null) { continue; }
548    
549                    // Compute vote-based rating.
550                    final Factor voteFactor = ExhibitPropsComputableMutable.calcVoteFactor(fullName, aep, dataSource, MIN_VOTE_SAMPLE_NEAR_TERM_TIME_MS);
551                    final int confidence = Math.min(ScoreAndConf.MAX, Math.max(0, Math.round(voteFactor.correlation * ScoreAndConf.MAX)));
552                    // If vote correlation for this exhibit is too low to use then skip it.
553                    if(confidence == 0) { continue; }
554                    final int score = Math.min(+ScoreAndConf.MAX, Math.max(-ScoreAndConf.MAX, Math.round(voteFactor.goodness * ScoreAndConf.MAX)));
555                    final ScoreAndConf sac = new ScoreAndConf(score, confidence);
556    
557                    // Store a useful vote factor in the map.
558                    m.put(shortName, sac);
559    //if(IsDebug.isDebug && (voteFactor.goodness > -1) && (voteFactor.goodness < +1)) { System.out.println("Middling vote Factor = "+voteFactor+" for "+fullName); }
560                    }
561    
562                // How many pure votes are available...
563                final int pureVoteSetSize = m.size();
564    
565                // The target result size.
566                // When trimming we have this the minimum acceptable size,
567                // else we potentially expand it to make use of abundant free heap space.
568                final int targetVFSSize = trim ? MIN_VOTED_FOR_SET_SIZE :
569                    Math.max(MIN_VOTED_FOR_SET_SIZE, (int) Math.min(Runtime.getRuntime().freeMemory() / FREE_HEAP_PER_VFE, Integer.MAX_VALUE));
570    log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): pure vote factors vs target: "+pureVoteSetSize+" vs "+targetVFSSize);
571    
572                // If we've been asked to trim the response
573                // and as gathered so far it is bigger than the minimum 'preferred' size,
574                // then we try to trim away the weakest (lowest confidence) items
575                // to pare the result back to minimum preferred/acceptable size.
576                if(trim && (pureVoteSetSize > targetVFSSize))
577                    {
578                    // Compute sorted list of all keys,
579                    // sorted primarily in ascending order by confidence
580                    // with a secondary, essentially-randomising sort
581                    // by name hash and other transient data,
582                    // allowing discard of exactly enough values to trim to MIN_VOTED_FOR_SET_SIZE
583                    // with otherwise-tied-by-confidence victims effectively selected randomly.
584                    final List<Name.ExhibitShort> namesByAscendingConf = new ArrayList<Name.ExhibitShort>(m.keySet());
585                    Collections.sort(namesByAscendingConf, new Comparator<Name.ExhibitShort>() {
586                        public int compare(final Name.ExhibitShort o1, final Name.ExhibitShort o2)
587                            {
588                            final ScoreAndConf sac1 = m.get(o1);
589                            final ScoreAndConf sac2 = m.get(o2);
590                            // Primary sort is by ascending confidence.
591                            final int cDiff = sac1.confidence - sac2.confidence;
592                            if(cDiff != 0) { return(cDiff); }
593                            // Effectively pick a random order to choose victims exhibits to discard.
594                            // This stays stable during any one sort,
595                            // but will probably change from sort to sort
596                            // as new ScoreAndConf instances are created.
597                            final int h1 = o1.hashCode() ^ System.identityHashCode(sac1);
598                            final int h2 = o2.hashCode() ^ System.identityHashCode(sac2);
599                            if(h1 < h2) { return(-1); }
600                            if(h1 > h2) { return(+1); }
601                            // Force a total ordering.
602                            return(TextUtils.compare(o1, o2));
603                            }
604                       });
605                    // Zap the lowest-confidence entries.
606                    final int toZap = pureVoteSetSize - targetVFSSize;
607                    final List<Name.ExhibitShort> zapList = namesByAscendingConf.subList(0, toZap);
608    log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): trimming "+toZap+"...");
609    for(int i = 0; i < Math.min(5, toZap); ++i) { log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): sac: " + m.get(zapList.get(i)) + " for " + zapList.get(i)); }
610                    m.keySet().removeAll(zapList);
611    
612    log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): pure vote factors after trim: "+m.size());
613                    }
614    
615                // Else if we're allowing viewing stats of any sort as a substitute for (absent) votes
616                // and we don't have a reasonable number of voted-for exhibits
617                // and we haven't been requested to trim the response size,
618                // then if an exhibit has any showing in the long-term stats at all we take that to be positive,
619                // else we take an absence to be (slightly) negative (if allowed), weighted appropriately,
620                // but in any case less than one or few explicit votes would have been,
621                // ie we prefer real votes to any proxies.
622                // In this case we use catalogue-page viewing stats,
623                // high-volume and maybe a truer vox-pop than exhibit download.
624                else if(ALLOW_VIEWING_STATS_ALONGSIDE_VOTES &&
625                       !trim &&
626                       (pureVoteSetSize < targetVFSSize))
627                    {
628                    do  { // Fake loop to allow a quick abort in case of difficulty...
629                        final EventVariableValue[] aPop =
630                            dataSource.getEventValues(SystemVariables.ACCESSPATTERN_CAT_PAGE_VIEW,
631                                EventPeriod.VLONG,
632                                0,
633                                null);
634                        // ABORT if we can't get the 'all' set of cat-page-view stats...
635                        if((aPop.length == 0) || (aPop[0] == null))
636                            { log.log("getVotedForExhibitsAndVoteFactors(): CANNOT FILL IN FOR TOO-FEW VOTES: no 'all' viewing stats."); break; }
637                        final EventVariableValue viewingStats = aPop[0];
638    
639                        // Pad out our map with values driven by viewing stats,
640                        // with a confidence capped to be less than the lowest non-zero real-vote confidence.
641    
642                        // First compute ready for our synthesised entries a confidence
643                        // just (one) less than the lowest confidence (if any) derived from explicit votes.
644                        // We don't allow this to be more than a low fixed ceiling.
645                        // That allows any 'real' votes to 'win'.
646                        short synthConf = MAX_NON_VOTE_CONF;
647                        for(final ScoreAndConf sac : m.values())
648                            {
649                            final short aSac = (short) (Math.abs(sac.confidence) - 1);
650                            if(aSac < synthConf) { synthConf = aSac; }
651                            }
652                        // ABORT if ceiling confidence is too low to use.
653                        if(synthConf <= 0)
654                            { log.log("getVotedForExhibitsAndVoteFactors(): CANNOT FILL IN FOR TOO-FEW VOTES: minimum real-vote confidence too low."); break; }
655    
656                        // Generate a value for every single exhibit if possible.
657                        // Anything ranked over half-way up all possible exhibits is +ve, else -ve.
658                        // (Most things probably can't rank because of the finite size of the recorded stats.)
659    
660                        // Fixed (and shared) low-ish 'unranked' penalty value (if allowing 'fill-in').
661                        // As it happens we'll use the same magnitude as the confidence ceiling for now.
662                        final ScoreAndConf unrankedPen = (!ALLOW_FILL_IN_FOR_ABSENT_STATS_AND_VOTES) ? null :
663                            MemoryTools.intern(new ScoreAndConf(-MAX_NON_VOTE_CONF, synthConf));
664    
665                        final int exhibitCount = aep.aeid.size();
666                        // Make the cut-off rank low enough to exclude at least half the exhibits,
667                        // and possibly set even lower to reflect the fact that we may only need
668                        // a small number of additional ersatz 'votes'
669                        // to generate a reasonable sample-set size.
670                        final int rankThreshold = Math.min(exhibitCount / 2, targetVFSSize - pureVoteSetSize);
671                        for(final Name.ExhibitShort shortName : aep.aeid.getAllExhibitShortNamesArraySorted())
672                            {
673                            // If there is already a real-vote-derived value then we don't need to do anything.
674                            if(m.containsKey(shortName)) { continue; }
675    
676                            // Synthesise a value in the absence of a real vote...
677                            final int rank = viewingStats.getRank(shortName.toString()); // Lower rank is better, zero is top/best.
678                            if(rank < rankThreshold)
679                                {
680                                final int score = Math.min(+ScoreAndConf.MAX, Math.max(1,
681                                    (int) ((ScoreAndConf.MAX * (long) (rankThreshold - rank)) / rankThreshold)));
682                                // Store positive score value...
683                                m.put(shortName, new ScoreAndConf(score, synthConf));
684                                }
685                            else if(ALLOW_FILL_IN_FOR_ABSENT_STATS_AND_VOTES)
686                                {
687                                m.put(shortName, unrankedPen); // No ranking: penalise mildly...
688                                }
689    //log.log("getVotedForExhibitsAndVoteFactors(): computed synthetic ScoreAndConf "+m.get(shortName)+" for "+shortName);
690                                }
691    
692    log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): vote factors plus selected stats-derived factors: "+m.size());
693                        } while(false); // End of fake loop to allow abort...
694                    }
695    
696                // Cache our new result (against our assumed-effectively-unique timestamp)...
697                final Map<Name.ExhibitShort, ScoreAndConf> result = Collections.unmodifiableMap(m);
698                _votedForExhibits.put(rawSet.first, result);
699                // Now zap anything older (ie stale).
700                // This does not remove our new entry, and therefore guarantees not to empty the map.
701                // Concurrent updates may leave multiple entries in the map until another update,
702                // but the map basically remains bounded in size.
703                Long oldest;
704                while((oldest = _votedForExhibits.firstKey()).longValue() < rawSet.first.longValue())
705                    { _votedForExhibits.remove(oldest); }
706    
707                // Return this known-good value.
708                return(result);
709                }
710    
711            // We didn't manage to compute a new/current/up-to-date result,
712            // but the caller has allowed us to return a stale result,
713            // so return the latest value from the cache, if any.
714            assert(allowStale);
715            if(!_votedForExhibits.isEmpty())
716                {
717                while(true)
718                    {
719                    final Map<Name.ExhibitShort, ScoreAndConf> result = _votedForExhibits.get(_votedForExhibits.lastKey());
720                    if(null != result) { return(result); }
721                    }
722                }
723    
724            // The cache was empty so rethrow the error that we got earlier
725            // when trying to compute a new/current/up-to-date value.
726            assert(err != null);
727            throw err;
728            }
729    
730        /**The (small) immutable current fixed set of parameterless base Scorer instances.
731         * Note that since these are fixed and immutable and have no parameters,
732         * we can safely create and store instances here.
733         * <p>
734         * These include the test/bogus Scorers as well as the live Scorers.
735         */
736        public static final Map<String, ScorerIF> fixedSimpleScorers;
737        /**Initialise fixedSimpleScorers. */
738        static
739            {
740            final ScorerIF simpleScorers[] =
741                {
742                // Bogus/fake/test Scorers.
743                new NoConfidence(),
744                new RandomScore(),
745                new RandomPositiveScore(),
746                new ZeroScore(),
747                new MinScore(),
748                new MaxScore(),
749                new FixedScore(),
750    
751                // True Scorers.
752                new SimpleExposure(),
753                new LocalSampler(),
754                // TODO: add new Scorer types here...
755                };
756            // We use a HashMap for speed of lookup.
757            final HashMap<String, ScorerIF> m = new HashMap<String, ScorerIF>(2 * simpleScorers.length);
758            for(final ScorerIF sc : simpleScorers)
759                { m.put(sc.getBaseName(), sc); }
760            fixedSimpleScorers = Collections.unmodifiableMap(m); // Make read-only.
761            }
762    
763        /**Minimum desirable site of set of voted-for exhibits for calibration to be fully confident of the result; strictly positive.
764         * This may determine the sizing of various memory structures.
765         * <p>
766         * Though empirically 512 has been observed to be a good value previously,
767         * now trimmed as of 2009/10 with severe memory pressure at the 256MB-heap master.
768         */
769        static final int MIN_VOTED_FOR_SET_SIZE = 256;
770    
771        /**Base set of available Scorers' names (no parameters); never null but may be empty.
772         * The values returned are of the form <i>ScorerName</i>.
773         */
774        public Set<String> getBaseScorersWithoutParameters()
775            { return(fixedSimpleScorers.keySet()); }
776    
777        /**Get base non-parameterised Scorer by name; null if no such base Scorer supported.
778         * @param  baseName  base (no parameters) name of Scorer; must not be null
779         */
780        public ScorerIF getBaseScorerByName(final String baseName)
781            { return(fixedSimpleScorers.get(baseName)); }
782    
783        /**Iterations allowed in extractCalibrationSet() to try to find an optimal result; strictly positive.
784         * This should probably be at least 3 or 4; and can much higher if calculation is quick.
785         */
786        private static final int MAX_ECS_ITERATIONS = 4;
787    
788        /**If true then select a partly-random selection of Scorers to measure error in _computeCalibrationSetError(). */
789        private static final boolean STATISTICAL_SCORER_SAMPLING_CCS = false;
790    
791        /**Max Scorer sample size in _computeCalibrationSetError() to prevent overflow in calculations. */
792        private static final int MAX_SCORER_SAMPLE_SIZE_CCS = 1 << (62 /*Allow sign bit + 1 spare*/ / 4);
793    
794        /**Compute 'error' in calibration set for the given population and cache contents; non-negative.
795         * This regards a computed calibration set as 'perfect' (zero error) if:
796         * <ul>
797         * <li>It predicts the same Scorers in the same order for the given basename
798         *     (or with the same composite score if the supplied base name is null).
799         *     We will generally take a sampling of Scorers of similar size
800         *     to the calibration exhibit set.
801         * <li>It marks the best Scorer as the best (or very nearly so),
802         *     ie so that anything that seems better than the 'best' Scorer by the computed set
803         *     will very likely prove to be so when measured against the full set.
804         * </ul>
805         * <p>
806         * This implicitly assumes that using the population ranking/comparator
807         * is the same as calibrating against the input calibration data
808         * (though that may prove false for stale cache values, etc).
809         * This avoids potentially-expensive recomputation against all input data.
810         * <p>
811         * Note that nothing is ever removed from the workspace (if supplied)
812         * making it (thread-) safe to share even between concurrent calls,
813         * but meaning also that its data may be bulky and get stale
814         * if preserved longer than the call to extract one calibration set.
815         * Use of this retained workspace should however save a great deal of time.
816         *
817         * @param putativeCalibrationExhibits  map of proposed exhibits (short names)
818         *     for calibration set to their real original input data (eg votes);
819         *     never null nor empty
820         * @param breedingSet  set of 'best' Scorers (of a single base type) or null for a generic set;
821         *     never empty
822         * @param workspace  if non-null then this is some workspace opaque to the caller
823         *     that can be used to save some results from one call to the next
824         *     (or even between concurrent calls, though this may not be efficient)
825         *     while evaluating the error for a single calibration set;
826         *     the caller must not alter this object
827         *
828         * @return  zero for an apparently 'perfect' computed calibration set,
829         *     with increasingly positive values for increasingly-badly-performing computed calibration sets
830         */
831        private int _computeCalibrationSetError(final Map<Name.ExhibitShort, ScoreAndConf> putativeCalibrationExhibits,
832                                                final List<String> breedingSet,
833                                                final ConcurrentMap<String, ConcurrentMap<Name.ExhibitShort, ScoreAndConf>> workspace)
834            throws IOException
835            {
836            assert((putativeCalibrationExhibits != null) && !putativeCalibrationExhibits.isEmpty());
837            assert((breedingSet == null) || !breedingSet.isEmpty());
838    
839            // For the moment this cannot cope with the generic (no-breeding-set) case
840            // so we regard the proposed calibration set as error-free since we cannot improve on it.
841            if(breedingSet == null)
842                {
843    if(IsDebug.isDebug) { System.err.println("WARNING: _computeCalibrationSetError() cannot yet handle generic case"); }
844                return(0);
845                }
846    
847            // Sort breeding set by descending rank/goodness using the population's comparator.
848            // Should probably be in-order anyway, but this ensures consistency.
849            final List<String> sortedBreedingSet = new ArrayList<String>(breedingSet);
850            // Extract the 'best' Scorer in the population
851            // so that we can ensure that it remains 'best' when measured by the calibration set.
852            final String bestScorer = sortedBreedingSet.get(0);
853    
854            final int pCESize = putativeCalibrationExhibits.size();
855            // Compute the ordered sampling of Scorers that we will use to measure calibration error.
856            final int breedingSetSize = sortedBreedingSet.size();
857            // Our final Scorer sample cannot be larger than the breeding set size...
858            final int maxScorerSampleSize = Math.min(Math.min(pCESize, breedingSetSize), MAX_SCORER_SAMPLE_SIZE_CCS);
859            final List<String> scorerSample = new ArrayList<String>(maxScorerSampleSize);
860            // The target size is ideally proportional to the exhibit size.
861            if(breedingSetSize <= maxScorerSampleSize)
862                {
863                // If there are fewer available Scorers than our target size
864                // then use them all, in order.
865                scorerSample.addAll(sortedBreedingSet);
866                }
867            else if(STATISTICAL_SCORER_SAMPLING_CCS)
868                {
869                // Indices of Scorers to use.
870                final BitSet samples = new BitSet(breedingSetSize);
871                // Always include the 'best' Scorer in the sample that we test
872                // since we always want the calibration set to recognise it as 'best'
873                // so that any Scorer that seems better when measured against the calibration set
874                // probably is better when compared against the full input (votes) data.
875                samples.set(0);
876                int bitsSet = 1;
877                // Choose the remaining Scorer samples at random,
878                // but limit the time that we spend, especially if we want nearly all of them.
879                // It doesn't matter if we end up with a slightly undersized sample set.
880                assert(samples.cardinality() == bitsSet);
881                for(int rounds = 4*maxScorerSampleSize; (rounds-- >= 0) && (bitsSet < maxScorerSampleSize); )
882                    {
883                    // Select a putative index (after the first one, always included).
884                    final int b = 1 + Rnd.goodRnd.nextInt(breedingSetSize-1);
885                    // Try again if already taken.
886                    if(samples.get(b)) { continue; }
887                    // Else if new, mark the bit.
888                    samples.set(b);
889                    ++bitsSet;
890                    }
891                assert(samples.cardinality() == bitsSet);
892                assert(bitsSet <= maxScorerSampleSize);
893                // Now walk the bitset, copying in the samples in order.
894                for(int i = samples.nextSetBit(0); i >= 0; i = samples.nextSetBit(i+1))
895                    { scorerSample.add(sortedBreedingSet.get(i)); }
896                assert(bitsSet == scorerSample.size());
897                }
898            else
899                {
900                // Just take the 'best'/top Scorers.
901                scorerSample.addAll(sortedBreedingSet.subList(0, maxScorerSampleSize));
902                }
903            final int scorerSampleSize = scorerSample.size();
904            assert(scorerSampleSize > 0);
905            assert(maxScorerSampleSize > 0);
906            assert(scorerSampleSize <= maxScorerSampleSize); // FIXME: failing
907    
908            // Map from Scorer name to map from (short) exhibit name to Scorer's predictions for that name.
909            // This allows us to perform an exhibit-ordered loop,
910            // so that we test the same exhibit with each sample Scorer in turn
911            // for best memory locality/performance.
912            // NOTE THAT WE NEVER REMOVE OR OVERWRITE ANY EXTANT ENTRY IN THIS MAP OR SUB-MAPS.
913            final ConcurrentMap<String, ConcurrentMap<Name.ExhibitShort, ScoreAndConf>> byScorerScores;
914            if(workspace == null)
915                {
916                // No workspace supplied; create our own temporary workspace now.
917                byScorerScores = new ConcurrentHashMap<String, ConcurrentMap<Name.ExhibitShort,ScoreAndConf>>(2*scorerSampleSize);
918                }
919            else { byScorerScores = workspace; }
920            // Ensure that we have a sub-map set up for each Scorer in this sample.
921            for(final String snp : scorerSample)
922                {
923                if(byScorerScores.get(snp) == null)
924                    { byScorerScores.putIfAbsent(snp, new ConcurrentHashMap<Name.ExhibitShort, ScoreAndConf>(2*pCESize)); }
925                }
926            final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
927            for(final Name.ExhibitShort exhibitShortName : putativeCalibrationExhibits.keySet())
928                {
929                final Name.ExhibitFull exhibitName = aeid.getFullName(exhibitShortName);
930                if(exhibitName == null) { continue; }
931                for(final String snp : scorerSample)
932                    {
933                    final ConcurrentMap<Name.ExhibitShort, ScoreAndConf> submap = byScorerScores.get(snp);
934                    // If we already have a result, avoid expensively recomputing it.
935                    if(submap.get(exhibitShortName) != null) { continue; }
936                    // We allow stale data for speed and robustness.
937                    final ScoreAndConf sac = computeUnweightedScoreAndConfidence(exhibitName, population.getScorerInstance(snp), true);
938                    submap.putIfAbsent(exhibitShortName, sac);
939                    }
940                }
941            // Compute the overall ScoreAndConf weighting (accuracy) for each sample Scorer.
942            final Map<String, ScoreAndConf> byScorerWeighting = new HashMap<String, ScoreAndConf>(2*scorerSampleSize);
943            for(final String snp : scorerSample)
944                {
945                final ScoreAndConf weight = ScorerCreator.computeWeighting(byScorerScores.get(snp), putativeCalibrationExhibits);
946                byScorerWeighting.put(snp, weight);
947                }
948            // Now compute the ranking of the sample Scorers using the calibration set.
949            // The 'goodness' is the primary sort.
950            // This may have similar secondary sorting to the population ranking, but that is not essential.
951            final Comparator<String> comp = new Comparator<String>()
952                {
953                // This treats missing items from _scorerScores as neutral (goodness 0).
954                public final int compare(final String a, final String b)
955                    {
956                    final ScoreAndConf sacA = byScorerWeighting.get(a);
957                    final ScoreAndConf sacB = byScorerWeighting.get(b);
958                    // Primary sort by goodness.
959                    final int goodnessA = (sacA == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacA);
960                    final int goodnessB = (sacB == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacB);
961                    if(goodnessA < goodnessB) { return(-1); }
962                    if(goodnessA > goodnessB) { return(+1); }
963                    if((sacA != null) && (sacB != null))
964                        {
965                        // Secondary sort by score (if both scores available).
966                        if(sacA.score < sacB.score) { return(-1); }
967                        if(sacA.score > sacB.score) { return(+1); }
968                        }
969                    // Break ties by name to ensure a total ordering.
970                    return(a.compareTo(b));
971                    }
972                };
973            final List<String> calibSetRankedScorers = new ArrayList<String>(scorerSample);
974            Collections.sort(calibSetRankedScorers, comp);
975            assert(calibSetRankedScorers.size() == scorerSampleSize);
976            // If we get exactly the same ordering then this seems to be a perfect calibration set.
977            if(calibSetRankedScorers.equals(scorerSample))
978                {
979    if(IsDebug.isDebug) { System.out.println("[_computeCalibrationSetError() exact match!]"); }
980                return(0);
981                }
982    
983            // Compute the RMS distance of each Scorer from its original ranking.
984            // FIXME: this value can reach scorerSampleSize^4.
985            long sqTotal = 0;
986            for(int i = scorerSampleSize; --i >= 0; )
987                {
988                final String snp = scorerSample.get(i);
989                final int newIndex = calibSetRankedScorers.indexOf(snp);
990                assert(newIndex >= 0); // All re-ranked Scorers must exist in the sample set and vv.
991                // Compute error in ranking (distance between correct and actual index).
992                final int distance = Math.abs(i - newIndex);
993                final int importance = (scorerSampleSize-i);
994                // We include a factor weighting the mispositioning of better Scorers as more serious.
995                if(distance != 0) { sqTotal += distance*distance * importance; }
996                }
997            assert(sqTotal >= 0); // Crude check for overflow.
998            // We adjust the RMS error to be no more than Integer.MAX_VALUE/2.
999            // FIXME: (max) RMS value is currently proportional to sample size^1.5: needs normalising...
1000            final int errorRMS = Math.max(0, (int) Math.round(Math.sqrt(sqTotal / (double) scorerSampleSize)));
1001    if(IsDebug.isDebug) { System.out.println("[_computeCalibrationSetError(): RMS error = "+errorRMS*2+" for Score sample size of "+scorerSampleSize+".]"); }
1002    
1003            // The most important feature is that the original 'best' Scorer should still be best or very close.
1004            // This is key to ensuring that this smaller calibration set
1005            // is a reasonable proxy for the full set of input calibration/vote data.
1006    //        final boolean bestIsStillBest = bestScorer.equals(calibSetRankedScorers.get(0));
1007            final boolean bestIsStillNearlyBest = calibSetRankedScorers.indexOf(bestScorer) <= (scorerSampleSize >>> 5);
1008            // If the best *is* still (nearly) best, then the calibration error is simply the RMS error.
1009            if(bestIsStillNearlyBest) { return(errorRMS); }
1010    if(IsDebug.isDebug) { System.out.println("[_computeCalibrationSetError(): does not identify 'best' correctly; at position/index "+calibSetRankedScorers.indexOf(bestScorer)+"]"); }
1011            // Else we add on a penalty that is guaranteed to outweigh any RMS error.
1012            return(errorRMS + (Integer.MAX_VALUE >>> 1));
1013            }
1014    
1015        /**Private cache for extractCalibrationSet(); never null.
1016         * Thread-safe LRU cache with capped size.
1017         * <p>
1018         * Map from (effectively) basename/size/errors to a calibration set and its time of computation.
1019         * <p>
1020         * The size parameter is the largest (non-negative) power of two no higher than the request size
1021         * (its rounded-down log to the base 2),
1022         * thus mapping arbitrary requested maximum sizes to a smaller number of powers of two.
1023         * (Note that this mapping may be further conflated/condensed to improve the cache effectiveness.)
1024         * <p>
1025         * May be trimmed right down automatically in case of memory shortage.
1026         */
1027        private final MemoryTools.SimpleLRUMapAutoSizeForHitRate<Triple<String,Integer,Boolean>, Pair<Long, Map<Name.ExhibitShort,ScoreAndConf>>> _eCS_cache;
1028    
1029        /**Compute exemplar sub-set of exhibits to calibrate Scorers with given base name against; never null but may be empty.
1030         * This implementation extracts the best available Scorer of the given base name (if any)
1031         * and finds some of the exhibits that it best predicts and worst predicts
1032         * and an even sampling of those in between and some random samplings of that median range.
1033         * The result should usually be a result for which the weighting of the selected Scorer
1034         * may be reasonably close to zero or to the mean of its accuracy range.
1035         * <p>
1036         * This may return an empty result if there is not enough data to calibrate against,
1037         * eg human-case votes.
1038         * <p>
1039         * It may be possible to tune or pre-test new Scorers against the results of this
1040         * as a fast filter.
1041         * <p>
1042         * If a base name is specified that is invalid, it is treated as if null.
1043         * <p>
1044         * We may cache generated results for a short while since this extraction may be expensive,
1045         * though the cache time is of the order of hours rather than the day or so elsewhere.
1046         *
1047         * @param baseName  base name of Scorer to extract calibration set for,
1048         *     or null for a generic all-Scorers calibration set
1049         * @param maxSamples  the maximum number of samples to return; strictly positive
1050         * @param difficult  if TRUE the return the difficult cases that we do not predict well,
1051         *     if FALSE then return the easy cases that we predict well,
1052         *     else return a mixture of good, bad, and other random cases
1053         * @param allowStale  if true then allow slightly older data for speed and robustness
1054         *
1055         * @return map of zero-or-more exhibits (short names) to calibration-data values; non-null
1056         */
1057        public Map<ExhibitShort, ScoreAndConf> extractCalibrationSet(final String baseName,
1058                                                                     final int maxSamples,
1059                                                                     final Boolean difficult,
1060                                                                     final boolean allowStale)
1061            throws IOException
1062            {
1063            if(maxSamples < 1) { throw new IllegalArgumentException(); }
1064    
1065            // Capture the start time early for a conservative cache timestamp.
1066            final long startTime = System.currentTimeMillis();
1067            final Long stL = new Long(startTime);
1068    
1069            // Get the voted-for exhibits, from which we can draw our calibration set.
1070            // Iff the system is stressed then work from a pre-trimmed sub-set,
1071            // but otherwise use everything available so as to maximise our view and coverage.
1072            final boolean trim = ((!MemoryTools.lotsFree()) || GenUtils.mustConservePower());
1073            final Map<Name.ExhibitShort, ScoreAndConf> votes = getVotedForExhibitsAndVoteFactors(allowStale, trim);
1074            final int vfSize = votes.size();
1075            // Return an empty result immediately (and don't cache it) if there are no votes to calibrate against.
1076            if(vfSize == 0) { return(Collections.emptyMap()); }
1077    
1078            // Get best Scorers with given base name, if any, best first.
1079            final List<String> breedingSet = ((baseName == null) || !getBaseScorersWithoutParameters().contains(baseName)) ?
1080                    null : population.getBreedingSet(baseName);
1081            // The effective base name being used, ie null if "as-if" null.
1082            final boolean noBreedingSet = (breedingSet == null) || breedingSet.isEmpty();
1083            final String effectiveBaseName = noBreedingSet ? null : baseName;
1084            final List<String> effectiveBreedingSet = noBreedingSet ? null : breedingSet;
1085    
1086            // If the set of voted-for exhibits is no larger than the maximum samples requested
1087            // and no basename is given (ie is null) or no suitable Scorers exist for that basename
1088            // and a mixture of difficult and easy cases has been requested
1089            // then return the full voted-for-exhibits list as-is for speed.
1090            if(noBreedingSet && (vfSize <= maxSamples) && !(difficult instanceof Boolean))
1091                { return(votes); }
1092    
1093            // We cannot ever generate a calibration set larger than the number of voted-for exhibits.
1094            final int cappedMaxSamples = Math.min(maxSamples, vfSize);
1095            assert(cappedMaxSamples > 0);
1096            // We'll round this to the nearest power of two not larger than this maximum.
1097            // And this log value is part of our key for efficient/dense representation.
1098            final int cMSLog = GenUtils.log2Approx(cappedMaxSamples);
1099            // Expand back to an actual size limit.
1100            final int roundedMaxSamples = 1 << cMSLog;
1101            assert(roundedMaxSamples <= cappedMaxSamples); // No larger than the original request.
1102            assert(roundedMaxSamples >= cappedMaxSamples/2); // Not less than half the original request size.
1103    
1104            // Compute the cache key.
1105            final Triple<String, Integer, Boolean> cacheKey = new Triple<String, Integer, Boolean>(effectiveBaseName, cMSLog, difficult);
1106            // Look up in cache...
1107            final Pair<Long, Map<Name.ExhibitShort, ScoreAndConf>> cachedValue = _eCS_cache.get(cacheKey);
1108            // If we have a cached result, and it is not too old, then return it immediately.
1109            if(cachedValue != null)
1110                {
1111                // If not allowing stale values then we insist on a value from the same LONG 'tick'
1112                // which should force also recomputation when our larger VLONG-timed caches get recomputed,
1113                // else we allow a longer grace period.
1114                final long intervalNumber = EventPeriod.LONG.getIntervalNumber(startTime);
1115                final long intervalNumberCached = EventPeriod.LONG.getIntervalNumber(cachedValue.first);
1116                if(intervalNumber == intervalNumberCached)
1117                    { return(cachedValue.second); }
1118                // Allow a couple of ticks for a stale value...
1119                if(allowStale && (intervalNumber - intervalNumberCached < 2))
1120                    { return(cachedValue.second); }
1121                // Fall through to recompute the value...
1122                }
1123    
1124            // If no baseName or no breeding set for the specified base name
1125            // then generate the generic calibration set.
1126            // If we do have a breeding set, etc, select the best item from the breeding set.
1127            final String bestOfBreed = ((breedingSet == null) || breedingSet.isEmpty()) ? null :
1128                breedingSet.get(0);
1129            final boolean noBoB = (bestOfBreed == null);
1130            final ScorerIF scBoB = noBoB ? null : population.getScorerInstance(bestOfBreed);
1131    
1132    if(IsDebug.isDebug) { System.out.println("ScorerCacheImpl.extractCalibrationSet(): bestOfBreed="+bestOfBreed+" for baseName="+baseName); }
1133    
1134            // Set of exhibits (short names) that we will return data for.
1135            final Set<Name.ExhibitShort> exhibits = new HashSet<Name.ExhibitShort>(2*roundedMaxSamples);
1136    
1137            // If there are no more voted-for exhibits than the max samples requested,
1138            // then we can just return all available voted-for exhibits
1139            // and we need not do any complex/tricky/expensive selection at all.
1140            if(roundedMaxSamples >= votes.size())
1141                { exhibits.addAll(votes.keySet()); }
1142            else
1143                {
1144                // Select example exhibits from the full/available set.
1145    
1146                // We need to be able to convert short names to long names...
1147                final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
1148    
1149                // Get composite results for all voted-for exhibits
1150                // ranked by ascending closeness to the right answer (descending error*confidence).
1151                // Higher closeness implies successful prediction,
1152                // so the outliers are our hard/easy bad/good samples.
1153                // However, we omit exhibits with zero confidence and zero scores
1154                // as probably indicating that we may simply not have any usable Scorers at all for them.
1155                // Allow this to swallow some I/O and calculation problems
1156                // (not enough to make a significant statistical difference,
1157                // but enough to ride out problems with individual exhibits, network glitches, etc).
1158                final int maxErrors = Math.max(1, roundedMaxSamples >>> 6); // ~1% errors will be ignored.
1159                // Error count during computation.
1160                int errorCount = 0;
1161                final Map<Name.ExhibitShort, ScoreAndConf> closenessScores = new HashMap<Name.ExhibitShort, ScoreAndConf>(2*vfSize);
1162                // We evaluate the exhibits in random order to eliminate order-based biases.
1163                final List<Name.ExhibitShort> shuffledExhibits = new ArrayList<Name.ExhibitShort>(votes.keySet());
1164                Collections.shuffle(shuffledExhibits, new Random(Rnd.fastRnd.nextLong()));
1165                for(final Name.ExhibitShort n : shuffledExhibits)
1166                    {
1167                    final Name.ExhibitFull fullName = aeid.getFullName(n);
1168                    if(fullName == null) { continue; }
1169    
1170                    final ScoreAndConf sac;
1171                    try
1172                        {
1173                        sac = noBoB ?
1174                            computeCompositeScoreAndConfidence(fullName, allowStale) :
1175                            computeUnweightedScoreAndConfidence(fullName, scBoB, allowStale);
1176                        }
1177                    catch(final IOException e)
1178                        {
1179                        // If error count gets too high then abort...
1180                        if(++errorCount > maxErrors) { throw e; }
1181                        // ...else just skip this exhibit.
1182                        else { continue; }
1183                        }
1184                    // Skip this exhibit if it cannot be scored at all (eg wrong type).
1185                    if((sac.confidence == 0) && (sac.score == 0)) { continue; }
1186                    final ScoreAndConf voteSac = votes.get(n);
1187                    // The error [0,+MAX] is (half) the absolute difference
1188                    // between the actual vote and the predicted score
1189                    // multiplied by the cross-confidence (normalised)
1190                    // so that lower-confidence pairs are weighted more neutrally.
1191                    // The closeness/score is MAX - error.
1192                    // The confidence is the cross-confidence.
1193                    // These metrics are chosen to work with composite and unweighted scores.
1194                    final int crossConfidence = sac.confidence * voteSac.confidence;
1195                    final long error = (Math.abs(sac.score - voteSac.score) * (long)crossConfidence)
1196                        / (2L * ScoreAndConf.MAX * ScoreAndConf.MAX);
1197                    assert((error >= 0) && (error <= ScoreAndConf.MAX));
1198                    final ScoreAndConf closeness = new ScoreAndConf(ScoreAndConf.MAX - (int)error, crossConfidence/ScoreAndConf.MAX);
1199                    closenessScores.put(n, closeness);
1200                    }
1201                // Return an empty result immediately (and don't cache it) if there are no scores to calibrate against.
1202                if(closenessScores.isEmpty()) { return(Collections.emptyMap()); }
1203                // Now compute closeness-sorted (lowest-first) results by exhibit.
1204                final List<Name.ExhibitShort> byCloseness = new ArrayList<Name.ExhibitShort>(closenessScores.keySet());
1205                Collections.sort(byCloseness, new Comparator<Name.ExhibitShort>() {
1206                    public final int compare(final Name.ExhibitShort n1, final Name.ExhibitShort n2)
1207                        {
1208                        final ScoreAndConf sac1 = closenessScores.get(n1);
1209                        final ScoreAndConf sac2 = closenessScores.get(n2);
1210                        // Primary sort by closeness (score).
1211                        if(sac1.score < sac2.score) { return(-1); }
1212                        if(sac1.score > sac2.score) { return(+1); }
1213                        // Secondary sort (tie breaking) by confidence.
1214                        if(sac1.confidence < sac2.confidence) { return(-1); }
1215                        if(sac1.confidence > sac2.confidence) { return(+1); }
1216                        // Treat as identical.
1217                        return(0);
1218                        }
1219                    });
1220                assert(!byCloseness.isEmpty());
1221                assert(byCloseness.size() == closenessScores.size());
1222                final int bCSize = byCloseness.size();
1223                final int targetResultSize = Math.min(roundedMaxSamples, bCSize);
1224                assert((bCSize == 0) || (closenessScores.get(byCloseness.get(0)).score <= closenessScores.get(byCloseness.get(bCSize-1)).score));
1225                // The number of outliers we will use depends on the "difficult" parameter.
1226                // If a mixture of values has been requested then we include the 25% best and 25% worst.
1227                // Don't have too many extreme cases: "Hard cases make bad law."
1228                final int nOutliersUsed = (difficult == null) ? (targetResultSize >>> 2) : targetResultSize;
1229                // UNLESS only easy cases were were requested THEN add in the most difficult (lowest closeness) cases.
1230                if(!Boolean.FALSE.equals(difficult))
1231                    { exhibits.addAll(byCloseness.subList(0, nOutliersUsed)); }
1232                // UNLESS only difficult cases requested THEN add in the easiest (highest closeness) cases.
1233                if(!Boolean.TRUE.equals(difficult))
1234                    { exhibits.addAll(byCloseness.subList(bCSize-nOutliersUsed, bCSize)); }
1235    
1236                // Top up result set to requested size with intermediate and random samples
1237                // when a mix of results has been requested.
1238                if(difficult == null)
1239                    {
1240                    // Create a view of the median gap from which we have no samples yet.
1241                    final List<Name.ExhibitShort> median = byCloseness.subList(nOutliersUsed, bCSize-nOutliersUsed);
1242                    final int medianSize = median.size();
1243    
1244                    // Inject some representative roughly-evenly-spaced samples
1245                    // (roughly half of the remaining free slots in the result).
1246                    // There is likely to be some bias given the integer arithmetic involved.
1247                    final int medianRegularSampleCount = Math.min(medianSize, targetResultSize-exhibits.size()) / 2;
1248                    if(medianRegularSampleCount > 0)
1249                        {
1250                        final int stepSize = medianSize / medianRegularSampleCount;
1251                        for(int i = (stepSize+1)/2; (i < medianSize) && (exhibits.size() < targetResultSize); i += stepSize)
1252                            { exhibits.add(median.get(i)); }
1253                        }
1254    
1255                    // Retain some expensive-to-compute values
1256                    // between evaluations of our putative calibration sets.
1257                    final ConcurrentHashMap<String, ConcurrentMap<Name.ExhibitShort, ScoreAndConf>> workspace = new ConcurrentHashMap<String, ConcurrentMap<Name.ExhibitShort,ScoreAndConf>>();
1258    
1259                    // Find a selection of random samples that gives the best results,
1260                    // by measuring the result calibration set error with different values.
1261                    int error = Integer.MAX_VALUE;
1262                    Set<Name.ExhibitShort> bestResult = null;
1263                    for(int i = MAX_ECS_ITERATIONS; --i >= 0; )
1264                        {
1265                        final Set<Name.ExhibitShort> putativeCalibSet = new HashSet<Name.ExhibitShort>(2 * targetResultSize);
1266                        putativeCalibSet.addAll(exhibits);
1267    
1268                        // Top-up with random samples from the median range.
1269                        // Only samples from the median range are potentially absent from the result so far,
1270                        // so that is the only place that we look for (new) random samples from.
1271                        // This may be slow if the desired result size is very close (below) the source size.
1272                        // So we limit the effort that we expend, and use a faster generator in that case;
1273                        // normally we use a good generator to keep our result as random as possible.
1274                        final Random r = (targetResultSize < medianSize/2) ? Rnd.goodRnd : Rnd.fastRnd;
1275                        for(int j = 2*medianSize; (putativeCalibSet.size() < targetResultSize) && (--j >= 0); )
1276                            { putativeCalibSet.add(byCloseness.get(r.nextInt(medianSize))); }
1277    
1278                        // Compute the 'error' in this putative calibration set.
1279                        final Map<Name.ExhibitShort, ScoreAndConf> calibMap = new HashMap<Name.ExhibitShort, ScoreAndConf>(votes);
1280                        assert(!calibMap.isEmpty());
1281                        assert(!putativeCalibSet.isEmpty());
1282                        calibMap.keySet().retainAll(putativeCalibSet);
1283                        assert(!calibMap.isEmpty()) : "putativeCalibSet = " + new ArrayList<Name.ExhibitShort>(putativeCalibSet);
1284                        final int pErr = _computeCalibrationSetError(calibMap, effectiveBreedingSet, workspace);
1285    if(IsDebug.isDebug) { System.out.println("[ScorerCacheImpl.extractCalibrationSet(): calibration error = "+pErr+" (attempts remaining "+i+").]"); }
1286                        assert(pErr >= 0);
1287                        // If this is the best so far then record it.
1288                        if(pErr < error)
1289                            {
1290                            error = pErr;
1291                            bestResult = putativeCalibSet;
1292                            // If the calibration error is zero then we can stop immediately.
1293                            if(pErr == 0) { break; }
1294                            }
1295                        }
1296    
1297                    // If we found a good result then use it.
1298                    if(bestResult != null) { exhibits.clear(); exhibits.addAll(bestResult); }
1299                    }
1300    //if(IsDebug.isDebug) { System.out.println("[ScorerCacheImpl.extractCalibrationSet(): vfSize/maxSamples/bCSize/exhibits.size()="+vfSize+'/'+maxSamples+'/'+bCSize+'/'+exhibits.size()+".]"); }
1301                }
1302    
1303            // Compute the result map.
1304            final Map<Name.ExhibitShort, ScoreAndConf> r = new HashMap<Name.ExhibitShort, ScoreAndConf>(2*exhibits.size());
1305            // Copy in the vote factor/score for each exemplar exhibit.
1306            for(final Name.ExhibitShort n : exhibits)
1307                { r.put(n, votes.get(n)); }
1308    
1309            // Compute the goodness of our model Scorer against this calibration set.
1310            if(bestOfBreed != null)
1311                {
1312                final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
1313                final Pair<ScoreAndConf, Boolean> weighting = ScorerCreator.computeWeighting(r, this, population.getScorerInstance(bestOfBreed), aeid, allowStale);
1314    if(IsDebug.isDebug) { System.out.println("$$$ bestOfBreed SAC and goodness against calibration set: " + weighting.first + ", goodness=" + ScoreAndConf.computeScorerGoodness(weighting.first)); }
1315                }
1316    
1317            // Wrap result for immutability and cache it.
1318            final Map<Name.ExhibitShort, ScoreAndConf> result = Collections.unmodifiableMap(r);
1319            _eCS_cache.put(cacheKey, new Pair<Long, Map<Name.ExhibitShort, ScoreAndConf>>(stL, result));
1320    
1321            return(result);
1322            }
1323    
1324    
1325        /**Scorer work object; never null. */
1326        private final ScorerCreator.ScorerWork scorerWork;
1327    
1328        /**Expected approximate poll cycle time (ms); strictly positive. */
1329        private static final int APPROX_POLL_CYCLE_MS = 1000;
1330    
1331        /**Do some 'evolution' in a background thread if possible.
1332         * If we have too much background activity going on this will do nothing.
1333         * <p>
1334         * This routine always returns quickly.
1335         * <p>
1336         * This should never throw any exceptions.
1337         *
1338         * @param  minimal  if true, run as short a time as possible to make some progress
1339         */
1340        private void evolve(final boolean minimal)
1341            {
1342            ThreadUtils.lowPriorityThreadPoolDiscardable.submit(new Runnable(){
1343            public final void run()
1344                {
1345                // Set a time to aim to stop processing by on this run.
1346                //
1347                // This is usually greater than the expected cycle time
1348                // to try to enable multiple threads to run in parallel on a multi-CPU server
1349                // (at minimum priority to be friendly to other processing)
1350                // up to the limit imposed by the thread pool.
1351                // However, on a single-CPU box (TODO or unless we are very lightly loaded)
1352                // then we run for much less than the expected cycle time
1353                // to try to avoid hogging all the CPU, especially on a single-CPU server.
1354                final int availableProcessors = Runtime.getRuntime().availableProcessors();
1355                // Set a very low minimum time in 'minimal' mode,
1356                // ie be prepared to stop immediately when we make some measurable progress.
1357                final long minEndTime = System.currentTimeMillis() + 1 + (minimal ? 0 : (APPROX_POLL_CYCLE_MS/8));
1358                // Ensure that the end time is always after a minimum end time...
1359                final long endTime = minEndTime +
1360                    ((!minimal && !GenUtils.mustConservePower() && (availableProcessors > 1)) ?
1361                        (APPROX_POLL_CYCLE_MS*(2+GenUtils.log2Approx(availableProcessors))) :
1362                        (APPROX_POLL_CYCLE_MS/4));
1363    
1364                // Do one chunk of work...
1365                try { scorerWork.doChunk(endTime, minEndTime, Rnd.fastRnd.nextBoolean(), inboundScorerQueue); }
1366                catch(final Throwable t)
1367                    {
1368                    // Note any unexpected errors that get this far...
1369                    log.log("ScorerCacheImpl.poll(): unexpected exception "+t.getClass().getName()+": "+t.getMessage());
1370                    t.printStackTrace();
1371                    }
1372                }
1373            });
1374    
1375            }
1376    
1377        /**Called/polled periodically (of the order of 1Hz) to do donkey-work and background tasks.
1378         * In particular, this call drives the search for improved Scorers,
1379         * as well as performing housekeeping and maintaining caches to speed foreground tasks.
1380         * <p>
1381         * This launches its work in a low-priority daemon thread,
1382         * and limits the number of such concurrent work threads globally
1383         * by silently discarding any excess,
1384         * ie this call always returns quickly.
1385         * <p>
1386         * This routine should not be called (often) if the host system is under heavy load.
1387         */
1388        public final void poll()
1389            throws IOException
1390            {
1391            // Allow a big timeslice if not conserving power and got lots of free memory...
1392            evolve(GenUtils.mustConservePower() || !MemoryTools.lotsFree());
1393            }
1394    
1395    
1396        /**Thread-safe, bounded size, queue for inbound Scorers; never null.
1397         * The upper bound on size should usually be set to be a small fraction of population size
1398         * on the grounds that we are unlikely to be able to quickly accept a large volume of new entrants
1399         * in any sensible way.
1400         */
1401        private final ArrayBlockingQueue<String> inboundScorerQueue;
1402    
1403        /**Attempt to queue an externally-supplied Scorer value; returns true if accepted.
1404         * This attempts to save the value supplied in our internal bounded-size queue.
1405         * <p>
1406         * Note that this accepts (and discards) anything which is definitely never usable,
1407         * eg syntactically invalid or not for a valid Scorer base name,
1408         * so as to help conserve resources such as memory and queue space.
1409         * <p>
1410         * This routine should always be relatively quick/efficient;
1411         * much more work is performed when an item is dequeued,
1412         * presumably when we know that we have ample resources available.
1413         * <p>
1414         * This may try to start some work immediately if the queue is now getting full.
1415         * <p>
1416         * Should not throw any exceptions; invalid Scorer values are simply discarded with true returned.
1417         */
1418        @Override
1419        public boolean offerExternalScorer(final String externalScorerNameAndParameters)
1420            {
1421            // If the queue is full then indicate that we cannot accept the Scorer value.
1422            if(inboundScorerQueue.remainingCapacity() == 0) { return(false); }
1423            // Validate and queue the value if possible.
1424            try
1425                {
1426                // Check that the Scorer value is parseable.
1427                final Pair<String, Map<String, String>> nap = AbstractScorer.parseNameAndParameters(externalScorerNameAndParameters);
1428                // Immediately absorb/discard unparsable Scorer or with invalid base name.
1429                if(!fixedSimpleScorers.containsKey(nap.first)) { return(true); }
1430                // Canonicalise (and hopefully minimise the size of) the Scorer; discard value if not possible.
1431                final ScorerIF sc = getScorerInstance(externalScorerNameAndParameters);
1432                if(sc == null) { return(true); }
1433                final String canon = AbstractScorer.canonicalise(sc);
1434                // Discard if already known to us at all, even as a stale value.
1435                if(null != population.getScorerWeighting(canon, true)) { return(true); }
1436                // Now try to queue the value and report if we were successful.
1437                final boolean result = inboundScorerQueue.offer(canon);
1438                // If getting full or already so then try to start a little work right now
1439                // (in a low-priority async thread) in the hope of catching up ASAP,
1440                // though not if already horribly stressed or busy.
1441                if(!canAcceptMoreExternalScorers() && !MemoryTools.isMemoryStressed())
1442                    { evolve(GenUtils.mustConservePower()); }
1443                // Tell the caller whether their item was queued successfully.
1444                return(result);
1445                }
1446            // Absorb/discard unparsable values immediately.
1447            catch(final IllegalArgumentException e) { return(true); }
1448            }
1449    
1450        /**Returns true if this cache can definitely accept (many) more externally-supplied Scorer values.
1451         * Even if this returns false we may in practice be able to accept
1452         * one or more new values: this is indicative.
1453         * <p>
1454         * True if our internal queue is at most about half full,
1455         * but not any sort of guarantee that we can actually accept another Scorer.
1456         * <p>
1457         * In power-conserving mode only welcome new entries if the queue is currently empty,
1458         * ie if we seem to be keeping up with any data coming in.
1459         * This is important because this call can be used to decide
1460         * when to hand out new work to clients,
1461         * from which the results will arrive only much later.
1462         */
1463        @Override
1464        public boolean canAcceptMoreExternalScorers()
1465            {
1466            if(GenUtils.mustConservePower()) { return(inboundScorerQueue.isEmpty()); }
1467            return(inboundScorerQueue.size() < inboundScorerQueue.remainingCapacity());
1468            }
1469    
1470        /**Returns true if at least once external Scorer is queued waiting to be processed. */
1471        @Override
1472        public boolean hasQueuedExternalScorer() { return(!inboundScorerQueue.isEmpty()); }
1473        }