001 /*
002 Copyright (c) 1996-2009, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.ai.scorer;
031
032 import java.io.IOException;
033 import java.util.ArrayList;
034 import java.util.BitSet;
035 import java.util.Collections;
036 import java.util.Comparator;
037 import java.util.HashMap;
038 import java.util.HashSet;
039 import java.util.List;
040 import java.util.Map;
041 import java.util.Random;
042 import java.util.Set;
043 import java.util.SortedMap;
044 import java.util.TreeMap;
045 import java.util.concurrent.ArrayBlockingQueue;
046 import java.util.concurrent.ConcurrentHashMap;
047 import java.util.concurrent.ConcurrentMap;
048 import java.util.concurrent.ConcurrentSkipListMap;
049
050 import org.hd.d.pg2k.ai.scorer.fixed.FixedScore;
051 import org.hd.d.pg2k.ai.scorer.fixed.MaxScore;
052 import org.hd.d.pg2k.ai.scorer.fixed.MinScore;
053 import org.hd.d.pg2k.ai.scorer.fixed.NoConfidence;
054 import org.hd.d.pg2k.ai.scorer.fixed.RandomPositiveScore;
055 import org.hd.d.pg2k.ai.scorer.fixed.RandomScore;
056 import org.hd.d.pg2k.ai.scorer.fixed.ZeroScore;
057 import org.hd.d.pg2k.ai.scorer.parameterised.LocalSampler;
058 import org.hd.d.pg2k.ai.scorer.parameterised.SimpleExposure;
059 import org.hd.d.pg2k.svrCore.AllExhibitImmutableData;
060 import org.hd.d.pg2k.svrCore.AllExhibitProperties;
061 import org.hd.d.pg2k.svrCore.CoreConsts;
062 import org.hd.d.pg2k.svrCore.ExhibitName;
063 import org.hd.d.pg2k.svrCore.ExhibitPropsComputableMutable;
064 import org.hd.d.pg2k.svrCore.GenUtils;
065 import org.hd.d.pg2k.svrCore.MemoryTools;
066 import org.hd.d.pg2k.svrCore.Name;
067 import org.hd.d.pg2k.svrCore.Rnd;
068 import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
069 import org.hd.d.pg2k.svrCore.TextUtils;
070 import org.hd.d.pg2k.svrCore.ThreadUtils;
071 import org.hd.d.pg2k.svrCore.ExhibitPropsComputableMutable.Factor;
072 import org.hd.d.pg2k.svrCore.Name.ExhibitFull;
073 import org.hd.d.pg2k.svrCore.Name.ExhibitShort;
074 import org.hd.d.pg2k.svrCore.Tuple.Pair;
075 import org.hd.d.pg2k.svrCore.Tuple.Triple;
076 import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
077 import org.hd.d.pg2k.svrCore.vars.EventPeriod;
078 import org.hd.d.pg2k.svrCore.vars.EventVariableValue;
079 import org.hd.d.pg2k.svrCore.vars.SimpleVariableDefinition;
080 import org.hd.d.pg2k.svrCore.vars.SystemVariables;
081
082 import ORG.hd.d.IsDebug;
083
084
085 /**Simple/default implementation to compute (and cache) the score and confidence for exhibits.
086 * Note: since the result of this computation may be used in computing
087 * (EPCM) the ExhibitPropsComputableMutable value for an exhibit,
088 * then any implementation of this must avoid forcing recalculation
089 * of any EPCM value to avoid danger of infinite recursion
090 * (other than the static calcVoteFactor() method).
091 * Ideally the value computed will not depend on any EPCM value.
092 */
093 public final class ScorerCacheImpl extends AbstractScorerCache implements ScorerCacheIF
094 {
095 /**Construct an instance attached to the supplied data source.
096 * We automatically record/persist/share new best-of-breed Scorers via the event mechanism.
097 *
098 * @param dataSource full-access live data source; must not be null
099 */
100 public ScorerCacheImpl(final SimpleExhibitPipelineIF dataSource,
101 final SimpleLoggerIF log)
102 {
103 super(new ScorerPopulation(fixedSimpleScorers, MAX_SCORER_SCORES_RETAINED, dataSource),
104 dataSource,
105 log);
106
107 // Set up our work/evolution object.
108 scorerWork = new ScorerCreator.ScorerWork(this, log, true);
109
110 // Set up the inbound Scorer queue, size-limited.
111 inboundScorerQueue = new ArrayBlockingQueue<String>(MAX_SCORER_SCORES_RETAINED/16);
112
113 // Set up the calibration set cache, size-limited.
114 // We bound the size by allowing several (>2) set sizes for each base Scorer type.
115 _eCS_cache = MemoryTools.SimpleLRUMapAutoSizeForHitRate.<Triple<String,Integer,Boolean>, Pair<Long, Map<Name.ExhibitShort,ScoreAndConf>>>create(1+fixedSimpleScorers.size(), 1 + 5*fixedSimpleScorers.size(), "_eCS_cache");
116 }
117
118 /**Computes a weighted composite score [-1,+1] and confidence [0,+1] for the specified exhibit with the best available scorers/parameters; never null but may be (0,0).
119 * This uses the Scorers available from getCurrentScorersWithParameters(),
120 * but skipping any marked as BadScorer that have made it through filtering.
121 * <p>
122 * This explicitly limits the time it spends on computation,
123 * more so if the system is short of power.
124 * <p>
125 * This tries to force the evolution of new Scorers in the background
126 * if it finds none at all usable during filtering and computation.
127 * (This means that the Scorer system can bootstrap itself just by calls to this routine,
128 * but will force some possibly-wasteful work for exhibit types that cannot be scored.)
129 *
130 * @param exhibitName valid full exhibit name
131 * @param allowStale if true then allow a stale value from cache,
132 * else throw an exception if nothing is currently available
133 *
134 * @return (0,0) if named scorer is not available,
135 * cannot be used with the specified parameters,
136 * or cannot be applied to the indicated exhibit
137 * (eg because of the exhibit type or the exhibit does not exist);
138 * else returns a non-null ScoreAndConf value
139 */
140 public ScoreAndConf computeCompositeScoreAndConfidence(final Name.ExhibitFull exhibitName,
141 final boolean allowStale)
142 throws IOException
143 {
144 if(!ExhibitName.validNameSyntax(exhibitName)) { throw new IllegalArgumentException("exhibit name must be full, not short"); }
145
146 final long startTime = System.currentTimeMillis();
147 final long endTime = startTime + 1 +
148 (GenUtils.mustConservePower() ? 0 : CoreConsts.MAX_INTERACTIVE_DELAY_MS/4);
149
150 // We apply each available non-zero-confidence Scorer to the exhibit,
151 // and compute a single overall score.
152 // We don't cache this result;
153 // we recompute it each time as it should be quick to so do
154 // as all of its components are cached.
155 // Note that this may be VERY slow if it forces all Scorers to be (re)evaluated.
156
157 // Compute results from individual eligible Scorers.
158 // Because of the lack of symmetry in scoring
159 // (ie reversing the value of a bad Scorer does not necessarily make it a good one)
160 // we are only interested in the highest-confidence, positive-correlation Scorers.
161 final SortedMap<ScoreAndConf, ScorerIF> results = new TreeMap<ScoreAndConf, ScorerIF>(ScoreAndConf.ByConfidence);
162 Set<String> currentScorersWithParameters = getCurrentScorersWithParameters(allowStale);
163 // As a bootstrap measure on an (effectively) empty population
164 // try to force evaluation of the base Scorers here,
165 // some of which may have some predictive merit anyway.
166 // We also try to kick off a background thread to create new Scorers.
167 if(currentScorersWithParameters.isEmpty())
168 {
169 // Try the base Scorer set as a fallback.
170 currentScorersWithParameters = getBaseScorersWithoutParameters();
171 // Try to kick-start Scorer creation in the background with a reasonable chunk of CPU.
172 evolve(GenUtils.mustConservePower());
173 }
174 // Now evaluate this exhibit with each Scorer,
175 // in a random order to avoid bias in case we run out of time at some point.
176 final List<String> shuffledCSWP = new ArrayList<String>(currentScorersWithParameters);
177 Collections.shuffle(shuffledCSWP, Rnd.fastRnd);
178 for(final String snp : shuffledCSWP)
179 {
180 // Compute the weighting over all available exhibits.
181 final ScoreAndConf scorerWeighting = computeScorerWeighting(snp, allowStale, null);
182 // Quickly reject those Scorers with a non-positive score or confidence.
183 if(scorerWeighting.score <= 0) { continue; }
184 if(scorerWeighting.confidence == 0) { continue; }
185 // Get Scorer instance.
186 final ScorerIF sc = getScorerInstance(snp);
187 assert(sc != null);
188 // Now "score" the exhibit with this Scorer.
189 final ScoreAndConf score = computeUnweightedScoreAndConfidence(exhibitName, sc, allowStale);
190 // Apply the weighting to the score, rejecting any whose confidence is now zero.
191 // Note that we only use the sign of the Scorer correlation (score) for the score,
192 // but we multiply both the Scorer correlation and confidence into the final confidence.
193 final ScoreAndConf weighted = new ScoreAndConf(score.score,
194 (int) ((score.confidence * scorerWeighting.confidence * (long)scorerWeighting.score) / (ScoreAndConf.MAX * (long) ScoreAndConf.MAX)));
195 if(weighted.confidence == 0) { continue; }
196 assert(weighted.confidence > 0);
197 results.put(weighted, sc);
198
199 // Try to cap total runtime providing that we have at least one result so far...
200 if(System.currentTimeMillis() > endTime) { break; }
201 }
202
203 final int rSize = results.size();
204 // Handle trivial cases quickly...
205 // If no applicable results then return "no opinion",
206 // and try to quickly evolve some usable Scorers.
207 if(rSize == 0) { evolve(true); return(ScoreAndConf.NO_OPINION); }
208
209 // Get the keys (S&C values) in rising order by confidence (ignoring sign).
210 List<ScoreAndConf> sorted = new ArrayList<ScoreAndConf>(results.keySet());
211 assert(sorted.get(0).confidence <= sorted.get(sorted.size()-1).confidence);
212
213 // Discard the least-confident results if we have more than a few.
214 // The aim is to reduce noise and improve "strength" of answer.
215 // We do this BEFORE weeding out any "straw man" dummy/fake/test Scorers
216 // so as to help push out any very weak real Scorers (weaker than chance/faking).
217 // We don't discard too many Scorers so as to allow for multi-modal ranking
218 // by a reasonable diversity of Scorer types.
219 // A real Scorer is only guaranteed to make the result if better than all the fakes.
220 final int toDiscard = (rSize >>> 1); // FIXME: Optimise/document this fraction.
221 if(toDiscard > 0) { sorted = sorted.subList(toDiscard, rSize); }
222 // Now weed out any remaining dummy/stooge/test/benchmark Scorers.
223 for(int i = sorted.size(); --i >= 0; )
224 { if(results.get(sorted.get(i)) instanceof BadScorer) { sorted.remove(i); } }
225 // Recompute the size.
226 final int finalSize = sorted.size();
227
228 // Handle trivial cases quickly...
229 // If no applicable results then return "no opinion",
230 // and try to evolve some usable Scorers.
231 if(finalSize == 0) { evolve(true); return(ScoreAndConf.NO_OPINION); }
232 // If exactly one result then return it "as is".
233 if(finalSize == 1) { return(sorted.get(0)); }
234
235 // Get the highest confidence value.
236 final int highestConfidence = sorted.get(finalSize-1).confidence;
237 assert(sorted.get(0).confidence <= highestConfidence);
238 // Compute the result score as the mean score*confidence
239 // (thus contributing to the result in proportion to confidence)
240 // normalised to be score*1 for the highest confidence,
241 // and the confidence being the RMS confidence of all contributors
242 // (to be biased towards greater confidence in the result
243 // when a number of low-confidence Scorers are included).
244 // Work downward from the best, stopping if we run out of time.
245 long totalSC = 0;
246 long totalConfSq = 0;
247 int valuesUsed = 0;
248 for(int i = finalSize; --i >= 0; )
249 {
250 final ScoreAndConf sc = sorted.get(i);
251 totalSC += (((sc.score * sc.confidence) * (long)ScoreAndConf.MAX) / highestConfidence);
252 totalConfSq += (sc.confidence * sc.confidence);
253 ++valuesUsed;
254 // Try to cap total runtime.
255 if(System.currentTimeMillis() > endTime) { break; }
256 }
257 assert(valuesUsed > 0);
258 // Normalise the score prediction to be able to run the full gammut [-MAX,+MAX].
259 final long rawCompositeScore = totalSC / (ScoreAndConf.MAX * valuesUsed);
260 //if(IsDebug.isDebug) { System.out.println("rSize = " + finalSize); }
261 //if(IsDebug.isDebug) { System.out.println("rawCompositeScore = " + rawCompositeScore); }
262 final long rawCompositeConf = Math.round(Math.sqrt(totalConfSq / valuesUsed));
263 //if(IsDebug.isDebug) { System.out.println("rawCompositeConf = " + rawCompositeConf); }
264 return(new ScoreAndConf((short) Math.min(ScoreAndConf.MAX, Math.max(-ScoreAndConf.MAX, rawCompositeScore)),
265 (short) Math.min(ScoreAndConf.MAX, Math.max(0, rawCompositeConf))));
266 }
267
268
269 /**Maximum number of Scorers for which the system retains/caches scores; strictly positive.
270 * This limits the amount of memory dedicated to cache, etc.
271 * <p>
272 * In practice 2^13--2^16 seems to be a reasonable limit.
273 */
274 private static final int MAX_SCORER_SCORES_RETAINED = (1 << 13);
275
276 /**The minimum recent period for which we collect all available votes, in ms; strictly positive.
277 * This is usually at least a year to smooth out for seasonal patterns.
278 */
279 private static final long MIN_VOTE_SAMPLE_NEAR_TERM_TIME_MS = 365242L * 24 * 3600; // 365.242 days in a year.
280
281 /**ScoreAndConfidence for the given scorer itself over all exhibit types; never null but may be (0,0) where the scorer is unknown or untested.
282 * Essentially the result of this should be multiplied by the result for each exhibit
283 * (for the same scorer and parameters)
284 * to normalise the predicted score and confidence for the exhibit,
285 * though usually the sign of the correlation (score) is multiplied into the raw score,
286 * while the value of correlation (score) is multiplied with the confidence
287 * into the final confidence.
288 * <p>
289 * We attempt to ignore small numbers of errors for robustness.
290 *
291 * @param allowStale if true then allow a stale or low-confidence value from cache,
292 * else throw an exception if nothing is currently available
293 * and we cannot quickly compute enough points to increase our confidence
294 * @return the score represents the correlation with the underlying votes
295 * (and whatever the scoring is measured against)
296 * with MAX meaning perfect correlation, 0 meaning no correlation,
297 * and -MAX meaning perfectly wrong answers all the time,
298 * and the confidence 0 if we have no (or very/too few) data points
299 * and approaching MAX as we have a large (enough) number of data points
300 */
301 public ScoreAndConf computeScorerWeighting(final ScorerIF scorer,
302 final boolean allowStale, final String source)
303 throws IOException
304 {
305 if(scorer == null) { throw new IllegalArgumentException(); }
306
307 // Return what we have in cache, if anything.
308 final ScoreAndConf cachedValue = population.getScorerWeighting(scorer, allowStale);
309 if(cachedValue != null) { return(cachedValue); }
310
311 final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
312 // If there are no live exhibits then we can't compute (nor cache) a result.
313 // This is especially important to avoid cacheing spurious false results at start-up.
314 if(aeid.length == 0) { return(ScoreAndConf.NO_OPINION); }
315
316 // Compute weighting (effectively the correlation) from scratch.
317 // Use a decent-quality subset of exhibits when the system is short of memory or power,
318 // in particular because this is likely to help reduce memory pressure
319 // from reducing the number of benchmark images being cached and checked against.
320 final boolean trim = (!MemoryTools.lotsFree()) || GenUtils.mustConservePower();
321 final Map<Name.ExhibitShort, ScoreAndConf> benchmarkExhibits = getVotedForExhibitsAndVoteFactors(allowStale, trim);
322 // If there are no voted-for exhibits then we cannot compute (nor cache) a result.
323 if(benchmarkExhibits.isEmpty()) { return(ScoreAndConf.NO_OPINION); }
324
325 // Compute the calibration result.
326 final Pair<ScoreAndConf, Boolean> weighting = ScorerCreator.computeWeighting(benchmarkExhibits, this, scorer, aeid, allowStale);
327 final ScoreAndConf result = weighting.first;
328 final boolean enoughData = !weighting.second;
329
330 // Atomically store/replace the result in cache
331 // iff enough data points were available and we hadn't run out of time, etc.
332 // We allow values computed with potentially-stale inputs to be cached
333 // as probably reasonably accurate if other conditions are met.
334 // Note that if the Scorer that we have just generated is the best
335 // then it may be persisted/propagated automatically by the population object.
336 if(enoughData)
337 {
338 final boolean isBest = population.putScorerWeighting(scorer, result, dataSource, log);
339 if(isBest)
340 {
341 log.log("[Computed/cached 'best' Scorer from source '"+source+"'; score and confidence "+result+"; "+scorer.getNameAndParameters()+" .]");
342 }
343 }
344
345 return(result);
346 }
347
348
349 /**Compute voted-for-exhibit set; never null but may be empty.
350 * This attempts to retrieve the latest vote event sets for all or most time slots
351 * (concentrating on more recent ones if necessary)
352 * and computes the set of all exhibits that have a vote.
353 * <p>
354 * This does <em>NOT</em> attempt to filter the result in any way,
355 * ie by which exhibits are still live or even syntactically-valid.
356 *
357 * @return the (non-null, possibly empty) set of voted-for-exhibits (short names)
358 * up to the last full VLONG slot
359 * paired with the time at which we started to compute/fetch it
360 * @throws IOException if the data is not available or there is a timeout
361 */
362 private Pair<Long, Set<Name.ExhibitShort>> _computeRawVotedForExhibits(final SimpleExhibitPipelineIF vars)
363 throws IOException
364 {
365 final long startTime = System.currentTimeMillis();
366 final Set<Name.ExhibitShort> exhibits = new HashSet<Name.ExhibitShort>(2*MIN_VOTED_FOR_SET_SIZE);
367
368 // Get vote events for all retained full/complete (VLONG) slots.
369 final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(startTime);
370 final BitSet whichValues = new BitSet(SystemVariables.EVENT_SAMPLES_RETAINED);
371 whichValues.set(0, SystemVariables.EVENT_SAMPLES_RETAINED);
372 final SimpleVariableDefinition[] defs = new SimpleVariableDefinition[]{ SystemVariables.VOTE_PRO, SystemVariables.VOTE_CON };
373 for(final SimpleVariableDefinition def : defs)
374 {
375 final EventVariableValue evvsAll[] = vars.getEventValues(
376 def, EventPeriod.VLONG, currentPeriod, whichValues);
377 for(final EventVariableValue evv : evvsAll)
378 {
379 if((evv != null) && (evv.getTotalEventCount() != 0))
380 {
381 // We expect the keys to be short exhibit names
382 // which we convert to Name.ExhibitShort values.
383 final AllExhibitImmutableData aeid = vars.getAllExhibitImmutableData(-1);
384 for(final Object o : evv.getDistinctValuesInRankOrder())
385 {
386 final ExhibitFull fullName = aeid.getFullName((String) o);
387 if(null == fullName) { continue; } // Can only happen if the data set changes..
388 exhibits.add(fullName.getShortName());
389 }
390 }
391 }
392 }
393
394 return(new Pair<Long, Set<Name.ExhibitShort>>(startTime, exhibits));
395 }
396
397 /**Cache of untrimmed voted-for exhibits (by short name) and their converted vote factors; never null.
398 * The exhibit set is those that have been voted for or against,
399 * possibly including no-longer valid/extant exhibit names,
400 * because we will be using this for calibration.
401 * <p>
402 * Private to getVotedForExhibits().
403 * <p>
404 * We index from the time at which we gathered the set
405 * so that we can regenerate or trim this list if the AEP or event slot changes.
406 * <p>
407 * Because the (first) map is sorted it is quick to find the latest/oldest entries.
408 * <p>
409 * Usually we only keep one value (the latest one),
410 * and any older values are purged only AFTER a new value has been inserted,
411 * ie this map never becomes transiently empty nor more stale than necessary once non-empty.
412 * <p>
413 * We can keep older information around to allow some (stale) results to be computed
414 * even if we cannot immediately fetch new information.
415 * <p>
416 * The outer and inner maps are thread-safe.
417 * <p>
418 * Do not assume that other activity in this collection can be locked out
419 * by holding a lock on this object. DO NOT hold a lock on this object!
420 * <p>
421 * The values must be immutable to enable them to be shared safely without copying.
422 * <p>
423 * TODO: clear this safely when memory is stressed.
424 */
425 private final SortedMap<Long, Map<Name.ExhibitShort, ScoreAndConf>> _votedForExhibitsUntrimmed =
426 new ConcurrentSkipListMap<Long, Map<Name.ExhibitShort,ScoreAndConf>>();
427
428 /**Cache of trimmed voted-for exhibits (by short name) and their converted vote factors; never null.
429 * Cache of trimmed values otherwise as for _votedForExhibitsUntrimmed.
430 * <p>
431 * Only pure votes are in this selection; no proxies/substitutes.
432 */
433 private final SortedMap<Long, Map<Name.ExhibitShort, ScoreAndConf>> _votedForExhibitsTrimmed =
434 new ConcurrentSkipListMap<Long, Map<Name.ExhibitShort,ScoreAndConf>>();
435
436 /**Nominal amount of heap space (bytes) to be free to accommodate one voted-for exhibit as well as normal Gallery operations; strictly positive.
437 * No more than say 25% to 50% of this amount of heap space should actually be used
438 * (eg by cached data such as expanded/sampled images for quick processing, and other artifacts)
439 * for each member of the voted-for set.
440 */
441 private static final int FREE_HEAP_PER_VFE = 1024 * 1024;
442
443 /**Maximum heap space per voted-for exhibit to support AI Scorer work in bytes; strictly positive.
444 * This should cover all memory resources,
445 * but is not an absolute per item limit nor an invitation to use up to the limit.
446 */
447 public static final int MAX_BYTES_PER_VOTED_FOR_EXHIBIT = FREE_HEAP_PER_VFE / 2;
448
449 /**If true then fold in download/viewing stats with explicit voting.
450 * This may make for a more robust and topical/accurate view of popularity.
451 */
452 private static final boolean ALLOW_VIEWING_STATS_ALONGSIDE_VOTES = true;
453
454 /**Maximum confidence (+ve/-ve) for vote stand-in data in range [0,ScoreAndConf.MAX].
455 * We will allow our confidence in a stats-derived ersatz 'vote' to be no higher than this,
456 * nor even as high as any actual real vote factors confidence value,
457 * ie so real votes beat synthetic ones every time.
458 * <p>
459 * Usually a small fraction of ScoreAndConf.MAX to prefer real votes by a huge margin.
460 */
461 private static final short MAX_NON_VOTE_CONF = ScoreAndConf.MAX >>> 3;
462
463 /**If true then when allowing viewing stats alongside votes we can score exhibits for not being viewed, etc.
464 * One problem with generating negative scores for exhibits without positive information
465 * is that we may hugely inflate the universe of exhibits to be examined
466 * (with an inflation in CPU/memory/other resources consumed)
467 * without necessarily adding much value or information.
468 * <p>
469 * In particular the memory burden of holding many thumbnails in memory for processing
470 * may be significant.
471 * <p>
472 * This should usually be false.
473 */
474 private static final boolean ALLOW_FILL_IN_FOR_ABSENT_STATS_AND_VOTES = false;
475
476 /**Get latest (immutable) Map of all voted-for exhibits (by short name) to vote score; never null.
477 * Returns a set of extant exhibits that have been voted for or against.
478 * <p>
479 * Uses a cache to avoid wasted computation from one event tick to the next,
480 * and to allow this mechanism to work even in the face of brief outages
481 * in data connectivity (eg when trying to fetch new values of the vote events).
482 * <p>
483 * We convert the vote Factor to a ScoreAndConf value so that more computations
484 * can be done as integer rather than floating, especially important for Niagara.
485 *
486 * @param allowStale if true, allow older data to be returned to save time,
487 * or if new event data should be used but cannot be fetched
488 * (ie allow working from stale event data for speed and/or robustness)
489 * @param trim if true then trim the result to a minimal useful set
490 * (ie typically no more than MIN_VOTED_FOR_SET_SIZE, dropping low-confidence values),
491 * else we may supplement the vote data with other usage stats
492 * if the result size would be MIN_VOTED_FOR_SET_SIZE
493 */
494 private Map<Name.ExhibitShort, ScoreAndConf> getVotedForExhibitsAndVoteFactors(final boolean allowStale,
495 final boolean trim)
496 throws IOException
497 {
498 final long startTime = System.currentTimeMillis();
499
500 // What is the current period (ie for non-stale data)?
501 final long currentPeriod = EventPeriod.VLONG.getIntervalNumber(startTime);
502
503 // Select appropriate (trimmed or full) value cache.
504 final SortedMap<Long, Map<Name.ExhibitShort, ScoreAndConf>> _votedForExhibits =
505 trim ? _votedForExhibitsTrimmed : _votedForExhibitsUntrimmed;
506 // Spin to retrieve latest cached result that is current (up-to-date).
507 // (If allowStale is true, we allow a previous-tick result.)
508 // This relies on:
509 // 1) the map never going empty once non-empty
510 // 2) the map never getting more stale
511 // (refetching the last key can only get the same or a newer value)
512 // This can only return with a "current" (enough) item.
513 // We fall through if an appropriate value is not available.
514 if(!_votedForExhibits.isEmpty())
515 {
516 while(EventPeriod.VLONG.getIntervalNumber(_votedForExhibits.lastKey()) >= (allowStale ? currentPeriod-1 : currentPeriod))
517 {
518 final Map<Name.ExhibitShort, ScoreAndConf> result = _votedForExhibits.get(_votedForExhibits.lastKey());
519 if(null != result) { return(result); }
520 }
521 }
522
523 // No suitable cached up-to-date value,
524 // so try to compute one (and cache it) now.
525 // If this cannot complete (generates an exception) and allowStale is false
526 // then we propagate that exception upwards.
527 Pair<Long, Set<Name.ExhibitShort>> rawSet = null;
528 IOException err = null;
529 if(!allowStale)
530 { rawSet = _computeRawVotedForExhibits(dataSource); }
531 else
532 { try { rawSet = _computeRawVotedForExhibits(dataSource); } catch(final IOException e) { err = e; /* Record for later. */ } }
533
534 // If we got something, then cache it, even though it *may* now not be the newest.
535 if(rawSet != null)
536 {
537 final AllExhibitProperties aep = dataSource.getAllExhibitProperties(-1);
538 // We refuse to cache the result from an empty AEP as possibly erroneous.
539 if(aep.aeid.length == 0) { return(Collections.emptyMap()); }
540
541 final Set<Name.ExhibitShort> vfe = rawSet.second;
542 final Map<Name.ExhibitShort, ScoreAndConf> m = new HashMap<Name.ExhibitShort, ScoreAndConf>(2 * vfe.size());
543 for(final Name.ExhibitShort shortName : vfe)
544 {
545 // If the exhibit is missing (eg has been deleted or renamed) then skip it.
546 final Name.ExhibitFull fullName = aep.aeid.getFullName(shortName);
547 if(fullName == null) { continue; }
548
549 // Compute vote-based rating.
550 final Factor voteFactor = ExhibitPropsComputableMutable.calcVoteFactor(fullName, aep, dataSource, MIN_VOTE_SAMPLE_NEAR_TERM_TIME_MS);
551 final int confidence = Math.min(ScoreAndConf.MAX, Math.max(0, Math.round(voteFactor.correlation * ScoreAndConf.MAX)));
552 // If vote correlation for this exhibit is too low to use then skip it.
553 if(confidence == 0) { continue; }
554 final int score = Math.min(+ScoreAndConf.MAX, Math.max(-ScoreAndConf.MAX, Math.round(voteFactor.goodness * ScoreAndConf.MAX)));
555 final ScoreAndConf sac = new ScoreAndConf(score, confidence);
556
557 // Store a useful vote factor in the map.
558 m.put(shortName, sac);
559 //if(IsDebug.isDebug && (voteFactor.goodness > -1) && (voteFactor.goodness < +1)) { System.out.println("Middling vote Factor = "+voteFactor+" for "+fullName); }
560 }
561
562 // How many pure votes are available...
563 final int pureVoteSetSize = m.size();
564
565 // The target result size.
566 // When trimming we have this the minimum acceptable size,
567 // else we potentially expand it to make use of abundant free heap space.
568 final int targetVFSSize = trim ? MIN_VOTED_FOR_SET_SIZE :
569 Math.max(MIN_VOTED_FOR_SET_SIZE, (int) Math.min(Runtime.getRuntime().freeMemory() / FREE_HEAP_PER_VFE, Integer.MAX_VALUE));
570 log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): pure vote factors vs target: "+pureVoteSetSize+" vs "+targetVFSSize);
571
572 // If we've been asked to trim the response
573 // and as gathered so far it is bigger than the minimum 'preferred' size,
574 // then we try to trim away the weakest (lowest confidence) items
575 // to pare the result back to minimum preferred/acceptable size.
576 if(trim && (pureVoteSetSize > targetVFSSize))
577 {
578 // Compute sorted list of all keys,
579 // sorted primarily in ascending order by confidence
580 // with a secondary, essentially-randomising sort
581 // by name hash and other transient data,
582 // allowing discard of exactly enough values to trim to MIN_VOTED_FOR_SET_SIZE
583 // with otherwise-tied-by-confidence victims effectively selected randomly.
584 final List<Name.ExhibitShort> namesByAscendingConf = new ArrayList<Name.ExhibitShort>(m.keySet());
585 Collections.sort(namesByAscendingConf, new Comparator<Name.ExhibitShort>() {
586 public int compare(final Name.ExhibitShort o1, final Name.ExhibitShort o2)
587 {
588 final ScoreAndConf sac1 = m.get(o1);
589 final ScoreAndConf sac2 = m.get(o2);
590 // Primary sort is by ascending confidence.
591 final int cDiff = sac1.confidence - sac2.confidence;
592 if(cDiff != 0) { return(cDiff); }
593 // Effectively pick a random order to choose victims exhibits to discard.
594 // This stays stable during any one sort,
595 // but will probably change from sort to sort
596 // as new ScoreAndConf instances are created.
597 final int h1 = o1.hashCode() ^ System.identityHashCode(sac1);
598 final int h2 = o2.hashCode() ^ System.identityHashCode(sac2);
599 if(h1 < h2) { return(-1); }
600 if(h1 > h2) { return(+1); }
601 // Force a total ordering.
602 return(TextUtils.compare(o1, o2));
603 }
604 });
605 // Zap the lowest-confidence entries.
606 final int toZap = pureVoteSetSize - targetVFSSize;
607 final List<Name.ExhibitShort> zapList = namesByAscendingConf.subList(0, toZap);
608 log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): trimming "+toZap+"...");
609 for(int i = 0; i < Math.min(5, toZap); ++i) { log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): sac: " + m.get(zapList.get(i)) + " for " + zapList.get(i)); }
610 m.keySet().removeAll(zapList);
611
612 log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): pure vote factors after trim: "+m.size());
613 }
614
615 // Else if we're allowing viewing stats of any sort as a substitute for (absent) votes
616 // and we don't have a reasonable number of voted-for exhibits
617 // and we haven't been requested to trim the response size,
618 // then if an exhibit has any showing in the long-term stats at all we take that to be positive,
619 // else we take an absence to be (slightly) negative (if allowed), weighted appropriately,
620 // but in any case less than one or few explicit votes would have been,
621 // ie we prefer real votes to any proxies.
622 // In this case we use catalogue-page viewing stats,
623 // high-volume and maybe a truer vox-pop than exhibit download.
624 else if(ALLOW_VIEWING_STATS_ALONGSIDE_VOTES &&
625 !trim &&
626 (pureVoteSetSize < targetVFSSize))
627 {
628 do { // Fake loop to allow a quick abort in case of difficulty...
629 final EventVariableValue[] aPop =
630 dataSource.getEventValues(SystemVariables.ACCESSPATTERN_CAT_PAGE_VIEW,
631 EventPeriod.VLONG,
632 0,
633 null);
634 // ABORT if we can't get the 'all' set of cat-page-view stats...
635 if((aPop.length == 0) || (aPop[0] == null))
636 { log.log("getVotedForExhibitsAndVoteFactors(): CANNOT FILL IN FOR TOO-FEW VOTES: no 'all' viewing stats."); break; }
637 final EventVariableValue viewingStats = aPop[0];
638
639 // Pad out our map with values driven by viewing stats,
640 // with a confidence capped to be less than the lowest non-zero real-vote confidence.
641
642 // First compute ready for our synthesised entries a confidence
643 // just (one) less than the lowest confidence (if any) derived from explicit votes.
644 // We don't allow this to be more than a low fixed ceiling.
645 // That allows any 'real' votes to 'win'.
646 short synthConf = MAX_NON_VOTE_CONF;
647 for(final ScoreAndConf sac : m.values())
648 {
649 final short aSac = (short) (Math.abs(sac.confidence) - 1);
650 if(aSac < synthConf) { synthConf = aSac; }
651 }
652 // ABORT if ceiling confidence is too low to use.
653 if(synthConf <= 0)
654 { log.log("getVotedForExhibitsAndVoteFactors(): CANNOT FILL IN FOR TOO-FEW VOTES: minimum real-vote confidence too low."); break; }
655
656 // Generate a value for every single exhibit if possible.
657 // Anything ranked over half-way up all possible exhibits is +ve, else -ve.
658 // (Most things probably can't rank because of the finite size of the recorded stats.)
659
660 // Fixed (and shared) low-ish 'unranked' penalty value (if allowing 'fill-in').
661 // As it happens we'll use the same magnitude as the confidence ceiling for now.
662 final ScoreAndConf unrankedPen = (!ALLOW_FILL_IN_FOR_ABSENT_STATS_AND_VOTES) ? null :
663 MemoryTools.intern(new ScoreAndConf(-MAX_NON_VOTE_CONF, synthConf));
664
665 final int exhibitCount = aep.aeid.size();
666 // Make the cut-off rank low enough to exclude at least half the exhibits,
667 // and possibly set even lower to reflect the fact that we may only need
668 // a small number of additional ersatz 'votes'
669 // to generate a reasonable sample-set size.
670 final int rankThreshold = Math.min(exhibitCount / 2, targetVFSSize - pureVoteSetSize);
671 for(final Name.ExhibitShort shortName : aep.aeid.getAllExhibitShortNamesArraySorted())
672 {
673 // If there is already a real-vote-derived value then we don't need to do anything.
674 if(m.containsKey(shortName)) { continue; }
675
676 // Synthesise a value in the absence of a real vote...
677 final int rank = viewingStats.getRank(shortName.toString()); // Lower rank is better, zero is top/best.
678 if(rank < rankThreshold)
679 {
680 final int score = Math.min(+ScoreAndConf.MAX, Math.max(1,
681 (int) ((ScoreAndConf.MAX * (long) (rankThreshold - rank)) / rankThreshold)));
682 // Store positive score value...
683 m.put(shortName, new ScoreAndConf(score, synthConf));
684 }
685 else if(ALLOW_FILL_IN_FOR_ABSENT_STATS_AND_VOTES)
686 {
687 m.put(shortName, unrankedPen); // No ranking: penalise mildly...
688 }
689 //log.log("getVotedForExhibitsAndVoteFactors(): computed synthetic ScoreAndConf "+m.get(shortName)+" for "+shortName);
690 }
691
692 log.log("ScorerCacheImpl.getVotedForExhibitsAndVoteFactors(): vote factors plus selected stats-derived factors: "+m.size());
693 } while(false); // End of fake loop to allow abort...
694 }
695
696 // Cache our new result (against our assumed-effectively-unique timestamp)...
697 final Map<Name.ExhibitShort, ScoreAndConf> result = Collections.unmodifiableMap(m);
698 _votedForExhibits.put(rawSet.first, result);
699 // Now zap anything older (ie stale).
700 // This does not remove our new entry, and therefore guarantees not to empty the map.
701 // Concurrent updates may leave multiple entries in the map until another update,
702 // but the map basically remains bounded in size.
703 Long oldest;
704 while((oldest = _votedForExhibits.firstKey()).longValue() < rawSet.first.longValue())
705 { _votedForExhibits.remove(oldest); }
706
707 // Return this known-good value.
708 return(result);
709 }
710
711 // We didn't manage to compute a new/current/up-to-date result,
712 // but the caller has allowed us to return a stale result,
713 // so return the latest value from the cache, if any.
714 assert(allowStale);
715 if(!_votedForExhibits.isEmpty())
716 {
717 while(true)
718 {
719 final Map<Name.ExhibitShort, ScoreAndConf> result = _votedForExhibits.get(_votedForExhibits.lastKey());
720 if(null != result) { return(result); }
721 }
722 }
723
724 // The cache was empty so rethrow the error that we got earlier
725 // when trying to compute a new/current/up-to-date value.
726 assert(err != null);
727 throw err;
728 }
729
730 /**The (small) immutable current fixed set of parameterless base Scorer instances.
731 * Note that since these are fixed and immutable and have no parameters,
732 * we can safely create and store instances here.
733 * <p>
734 * These include the test/bogus Scorers as well as the live Scorers.
735 */
736 public static final Map<String, ScorerIF> fixedSimpleScorers;
737 /**Initialise fixedSimpleScorers. */
738 static
739 {
740 final ScorerIF simpleScorers[] =
741 {
742 // Bogus/fake/test Scorers.
743 new NoConfidence(),
744 new RandomScore(),
745 new RandomPositiveScore(),
746 new ZeroScore(),
747 new MinScore(),
748 new MaxScore(),
749 new FixedScore(),
750
751 // True Scorers.
752 new SimpleExposure(),
753 new LocalSampler(),
754 // TODO: add new Scorer types here...
755 };
756 // We use a HashMap for speed of lookup.
757 final HashMap<String, ScorerIF> m = new HashMap<String, ScorerIF>(2 * simpleScorers.length);
758 for(final ScorerIF sc : simpleScorers)
759 { m.put(sc.getBaseName(), sc); }
760 fixedSimpleScorers = Collections.unmodifiableMap(m); // Make read-only.
761 }
762
763 /**Minimum desirable site of set of voted-for exhibits for calibration to be fully confident of the result; strictly positive.
764 * This may determine the sizing of various memory structures.
765 * <p>
766 * Though empirically 512 has been observed to be a good value previously,
767 * now trimmed as of 2009/10 with severe memory pressure at the 256MB-heap master.
768 */
769 static final int MIN_VOTED_FOR_SET_SIZE = 256;
770
771 /**Base set of available Scorers' names (no parameters); never null but may be empty.
772 * The values returned are of the form <i>ScorerName</i>.
773 */
774 public Set<String> getBaseScorersWithoutParameters()
775 { return(fixedSimpleScorers.keySet()); }
776
777 /**Get base non-parameterised Scorer by name; null if no such base Scorer supported.
778 * @param baseName base (no parameters) name of Scorer; must not be null
779 */
780 public ScorerIF getBaseScorerByName(final String baseName)
781 { return(fixedSimpleScorers.get(baseName)); }
782
783 /**Iterations allowed in extractCalibrationSet() to try to find an optimal result; strictly positive.
784 * This should probably be at least 3 or 4; and can much higher if calculation is quick.
785 */
786 private static final int MAX_ECS_ITERATIONS = 4;
787
788 /**If true then select a partly-random selection of Scorers to measure error in _computeCalibrationSetError(). */
789 private static final boolean STATISTICAL_SCORER_SAMPLING_CCS = false;
790
791 /**Max Scorer sample size in _computeCalibrationSetError() to prevent overflow in calculations. */
792 private static final int MAX_SCORER_SAMPLE_SIZE_CCS = 1 << (62 /*Allow sign bit + 1 spare*/ / 4);
793
794 /**Compute 'error' in calibration set for the given population and cache contents; non-negative.
795 * This regards a computed calibration set as 'perfect' (zero error) if:
796 * <ul>
797 * <li>It predicts the same Scorers in the same order for the given basename
798 * (or with the same composite score if the supplied base name is null).
799 * We will generally take a sampling of Scorers of similar size
800 * to the calibration exhibit set.
801 * <li>It marks the best Scorer as the best (or very nearly so),
802 * ie so that anything that seems better than the 'best' Scorer by the computed set
803 * will very likely prove to be so when measured against the full set.
804 * </ul>
805 * <p>
806 * This implicitly assumes that using the population ranking/comparator
807 * is the same as calibrating against the input calibration data
808 * (though that may prove false for stale cache values, etc).
809 * This avoids potentially-expensive recomputation against all input data.
810 * <p>
811 * Note that nothing is ever removed from the workspace (if supplied)
812 * making it (thread-) safe to share even between concurrent calls,
813 * but meaning also that its data may be bulky and get stale
814 * if preserved longer than the call to extract one calibration set.
815 * Use of this retained workspace should however save a great deal of time.
816 *
817 * @param putativeCalibrationExhibits map of proposed exhibits (short names)
818 * for calibration set to their real original input data (eg votes);
819 * never null nor empty
820 * @param breedingSet set of 'best' Scorers (of a single base type) or null for a generic set;
821 * never empty
822 * @param workspace if non-null then this is some workspace opaque to the caller
823 * that can be used to save some results from one call to the next
824 * (or even between concurrent calls, though this may not be efficient)
825 * while evaluating the error for a single calibration set;
826 * the caller must not alter this object
827 *
828 * @return zero for an apparently 'perfect' computed calibration set,
829 * with increasingly positive values for increasingly-badly-performing computed calibration sets
830 */
831 private int _computeCalibrationSetError(final Map<Name.ExhibitShort, ScoreAndConf> putativeCalibrationExhibits,
832 final List<String> breedingSet,
833 final ConcurrentMap<String, ConcurrentMap<Name.ExhibitShort, ScoreAndConf>> workspace)
834 throws IOException
835 {
836 assert((putativeCalibrationExhibits != null) && !putativeCalibrationExhibits.isEmpty());
837 assert((breedingSet == null) || !breedingSet.isEmpty());
838
839 // For the moment this cannot cope with the generic (no-breeding-set) case
840 // so we regard the proposed calibration set as error-free since we cannot improve on it.
841 if(breedingSet == null)
842 {
843 if(IsDebug.isDebug) { System.err.println("WARNING: _computeCalibrationSetError() cannot yet handle generic case"); }
844 return(0);
845 }
846
847 // Sort breeding set by descending rank/goodness using the population's comparator.
848 // Should probably be in-order anyway, but this ensures consistency.
849 final List<String> sortedBreedingSet = new ArrayList<String>(breedingSet);
850 // Extract the 'best' Scorer in the population
851 // so that we can ensure that it remains 'best' when measured by the calibration set.
852 final String bestScorer = sortedBreedingSet.get(0);
853
854 final int pCESize = putativeCalibrationExhibits.size();
855 // Compute the ordered sampling of Scorers that we will use to measure calibration error.
856 final int breedingSetSize = sortedBreedingSet.size();
857 // Our final Scorer sample cannot be larger than the breeding set size...
858 final int maxScorerSampleSize = Math.min(Math.min(pCESize, breedingSetSize), MAX_SCORER_SAMPLE_SIZE_CCS);
859 final List<String> scorerSample = new ArrayList<String>(maxScorerSampleSize);
860 // The target size is ideally proportional to the exhibit size.
861 if(breedingSetSize <= maxScorerSampleSize)
862 {
863 // If there are fewer available Scorers than our target size
864 // then use them all, in order.
865 scorerSample.addAll(sortedBreedingSet);
866 }
867 else if(STATISTICAL_SCORER_SAMPLING_CCS)
868 {
869 // Indices of Scorers to use.
870 final BitSet samples = new BitSet(breedingSetSize);
871 // Always include the 'best' Scorer in the sample that we test
872 // since we always want the calibration set to recognise it as 'best'
873 // so that any Scorer that seems better when measured against the calibration set
874 // probably is better when compared against the full input (votes) data.
875 samples.set(0);
876 int bitsSet = 1;
877 // Choose the remaining Scorer samples at random,
878 // but limit the time that we spend, especially if we want nearly all of them.
879 // It doesn't matter if we end up with a slightly undersized sample set.
880 assert(samples.cardinality() == bitsSet);
881 for(int rounds = 4*maxScorerSampleSize; (rounds-- >= 0) && (bitsSet < maxScorerSampleSize); )
882 {
883 // Select a putative index (after the first one, always included).
884 final int b = 1 + Rnd.goodRnd.nextInt(breedingSetSize-1);
885 // Try again if already taken.
886 if(samples.get(b)) { continue; }
887 // Else if new, mark the bit.
888 samples.set(b);
889 ++bitsSet;
890 }
891 assert(samples.cardinality() == bitsSet);
892 assert(bitsSet <= maxScorerSampleSize);
893 // Now walk the bitset, copying in the samples in order.
894 for(int i = samples.nextSetBit(0); i >= 0; i = samples.nextSetBit(i+1))
895 { scorerSample.add(sortedBreedingSet.get(i)); }
896 assert(bitsSet == scorerSample.size());
897 }
898 else
899 {
900 // Just take the 'best'/top Scorers.
901 scorerSample.addAll(sortedBreedingSet.subList(0, maxScorerSampleSize));
902 }
903 final int scorerSampleSize = scorerSample.size();
904 assert(scorerSampleSize > 0);
905 assert(maxScorerSampleSize > 0);
906 assert(scorerSampleSize <= maxScorerSampleSize); // FIXME: failing
907
908 // Map from Scorer name to map from (short) exhibit name to Scorer's predictions for that name.
909 // This allows us to perform an exhibit-ordered loop,
910 // so that we test the same exhibit with each sample Scorer in turn
911 // for best memory locality/performance.
912 // NOTE THAT WE NEVER REMOVE OR OVERWRITE ANY EXTANT ENTRY IN THIS MAP OR SUB-MAPS.
913 final ConcurrentMap<String, ConcurrentMap<Name.ExhibitShort, ScoreAndConf>> byScorerScores;
914 if(workspace == null)
915 {
916 // No workspace supplied; create our own temporary workspace now.
917 byScorerScores = new ConcurrentHashMap<String, ConcurrentMap<Name.ExhibitShort,ScoreAndConf>>(2*scorerSampleSize);
918 }
919 else { byScorerScores = workspace; }
920 // Ensure that we have a sub-map set up for each Scorer in this sample.
921 for(final String snp : scorerSample)
922 {
923 if(byScorerScores.get(snp) == null)
924 { byScorerScores.putIfAbsent(snp, new ConcurrentHashMap<Name.ExhibitShort, ScoreAndConf>(2*pCESize)); }
925 }
926 final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
927 for(final Name.ExhibitShort exhibitShortName : putativeCalibrationExhibits.keySet())
928 {
929 final Name.ExhibitFull exhibitName = aeid.getFullName(exhibitShortName);
930 if(exhibitName == null) { continue; }
931 for(final String snp : scorerSample)
932 {
933 final ConcurrentMap<Name.ExhibitShort, ScoreAndConf> submap = byScorerScores.get(snp);
934 // If we already have a result, avoid expensively recomputing it.
935 if(submap.get(exhibitShortName) != null) { continue; }
936 // We allow stale data for speed and robustness.
937 final ScoreAndConf sac = computeUnweightedScoreAndConfidence(exhibitName, population.getScorerInstance(snp), true);
938 submap.putIfAbsent(exhibitShortName, sac);
939 }
940 }
941 // Compute the overall ScoreAndConf weighting (accuracy) for each sample Scorer.
942 final Map<String, ScoreAndConf> byScorerWeighting = new HashMap<String, ScoreAndConf>(2*scorerSampleSize);
943 for(final String snp : scorerSample)
944 {
945 final ScoreAndConf weight = ScorerCreator.computeWeighting(byScorerScores.get(snp), putativeCalibrationExhibits);
946 byScorerWeighting.put(snp, weight);
947 }
948 // Now compute the ranking of the sample Scorers using the calibration set.
949 // The 'goodness' is the primary sort.
950 // This may have similar secondary sorting to the population ranking, but that is not essential.
951 final Comparator<String> comp = new Comparator<String>()
952 {
953 // This treats missing items from _scorerScores as neutral (goodness 0).
954 public final int compare(final String a, final String b)
955 {
956 final ScoreAndConf sacA = byScorerWeighting.get(a);
957 final ScoreAndConf sacB = byScorerWeighting.get(b);
958 // Primary sort by goodness.
959 final int goodnessA = (sacA == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacA);
960 final int goodnessB = (sacB == null) ? 0 : ScoreAndConf.computeScorerGoodness(sacB);
961 if(goodnessA < goodnessB) { return(-1); }
962 if(goodnessA > goodnessB) { return(+1); }
963 if((sacA != null) && (sacB != null))
964 {
965 // Secondary sort by score (if both scores available).
966 if(sacA.score < sacB.score) { return(-1); }
967 if(sacA.score > sacB.score) { return(+1); }
968 }
969 // Break ties by name to ensure a total ordering.
970 return(a.compareTo(b));
971 }
972 };
973 final List<String> calibSetRankedScorers = new ArrayList<String>(scorerSample);
974 Collections.sort(calibSetRankedScorers, comp);
975 assert(calibSetRankedScorers.size() == scorerSampleSize);
976 // If we get exactly the same ordering then this seems to be a perfect calibration set.
977 if(calibSetRankedScorers.equals(scorerSample))
978 {
979 if(IsDebug.isDebug) { System.out.println("[_computeCalibrationSetError() exact match!]"); }
980 return(0);
981 }
982
983 // Compute the RMS distance of each Scorer from its original ranking.
984 // FIXME: this value can reach scorerSampleSize^4.
985 long sqTotal = 0;
986 for(int i = scorerSampleSize; --i >= 0; )
987 {
988 final String snp = scorerSample.get(i);
989 final int newIndex = calibSetRankedScorers.indexOf(snp);
990 assert(newIndex >= 0); // All re-ranked Scorers must exist in the sample set and vv.
991 // Compute error in ranking (distance between correct and actual index).
992 final int distance = Math.abs(i - newIndex);
993 final int importance = (scorerSampleSize-i);
994 // We include a factor weighting the mispositioning of better Scorers as more serious.
995 if(distance != 0) { sqTotal += distance*distance * importance; }
996 }
997 assert(sqTotal >= 0); // Crude check for overflow.
998 // We adjust the RMS error to be no more than Integer.MAX_VALUE/2.
999 // FIXME: (max) RMS value is currently proportional to sample size^1.5: needs normalising...
1000 final int errorRMS = Math.max(0, (int) Math.round(Math.sqrt(sqTotal / (double) scorerSampleSize)));
1001 if(IsDebug.isDebug) { System.out.println("[_computeCalibrationSetError(): RMS error = "+errorRMS*2+" for Score sample size of "+scorerSampleSize+".]"); }
1002
1003 // The most important feature is that the original 'best' Scorer should still be best or very close.
1004 // This is key to ensuring that this smaller calibration set
1005 // is a reasonable proxy for the full set of input calibration/vote data.
1006 // final boolean bestIsStillBest = bestScorer.equals(calibSetRankedScorers.get(0));
1007 final boolean bestIsStillNearlyBest = calibSetRankedScorers.indexOf(bestScorer) <= (scorerSampleSize >>> 5);
1008 // If the best *is* still (nearly) best, then the calibration error is simply the RMS error.
1009 if(bestIsStillNearlyBest) { return(errorRMS); }
1010 if(IsDebug.isDebug) { System.out.println("[_computeCalibrationSetError(): does not identify 'best' correctly; at position/index "+calibSetRankedScorers.indexOf(bestScorer)+"]"); }
1011 // Else we add on a penalty that is guaranteed to outweigh any RMS error.
1012 return(errorRMS + (Integer.MAX_VALUE >>> 1));
1013 }
1014
1015 /**Private cache for extractCalibrationSet(); never null.
1016 * Thread-safe LRU cache with capped size.
1017 * <p>
1018 * Map from (effectively) basename/size/errors to a calibration set and its time of computation.
1019 * <p>
1020 * The size parameter is the largest (non-negative) power of two no higher than the request size
1021 * (its rounded-down log to the base 2),
1022 * thus mapping arbitrary requested maximum sizes to a smaller number of powers of two.
1023 * (Note that this mapping may be further conflated/condensed to improve the cache effectiveness.)
1024 * <p>
1025 * May be trimmed right down automatically in case of memory shortage.
1026 */
1027 private final MemoryTools.SimpleLRUMapAutoSizeForHitRate<Triple<String,Integer,Boolean>, Pair<Long, Map<Name.ExhibitShort,ScoreAndConf>>> _eCS_cache;
1028
1029 /**Compute exemplar sub-set of exhibits to calibrate Scorers with given base name against; never null but may be empty.
1030 * This implementation extracts the best available Scorer of the given base name (if any)
1031 * and finds some of the exhibits that it best predicts and worst predicts
1032 * and an even sampling of those in between and some random samplings of that median range.
1033 * The result should usually be a result for which the weighting of the selected Scorer
1034 * may be reasonably close to zero or to the mean of its accuracy range.
1035 * <p>
1036 * This may return an empty result if there is not enough data to calibrate against,
1037 * eg human-case votes.
1038 * <p>
1039 * It may be possible to tune or pre-test new Scorers against the results of this
1040 * as a fast filter.
1041 * <p>
1042 * If a base name is specified that is invalid, it is treated as if null.
1043 * <p>
1044 * We may cache generated results for a short while since this extraction may be expensive,
1045 * though the cache time is of the order of hours rather than the day or so elsewhere.
1046 *
1047 * @param baseName base name of Scorer to extract calibration set for,
1048 * or null for a generic all-Scorers calibration set
1049 * @param maxSamples the maximum number of samples to return; strictly positive
1050 * @param difficult if TRUE the return the difficult cases that we do not predict well,
1051 * if FALSE then return the easy cases that we predict well,
1052 * else return a mixture of good, bad, and other random cases
1053 * @param allowStale if true then allow slightly older data for speed and robustness
1054 *
1055 * @return map of zero-or-more exhibits (short names) to calibration-data values; non-null
1056 */
1057 public Map<ExhibitShort, ScoreAndConf> extractCalibrationSet(final String baseName,
1058 final int maxSamples,
1059 final Boolean difficult,
1060 final boolean allowStale)
1061 throws IOException
1062 {
1063 if(maxSamples < 1) { throw new IllegalArgumentException(); }
1064
1065 // Capture the start time early for a conservative cache timestamp.
1066 final long startTime = System.currentTimeMillis();
1067 final Long stL = new Long(startTime);
1068
1069 // Get the voted-for exhibits, from which we can draw our calibration set.
1070 // Iff the system is stressed then work from a pre-trimmed sub-set,
1071 // but otherwise use everything available so as to maximise our view and coverage.
1072 final boolean trim = ((!MemoryTools.lotsFree()) || GenUtils.mustConservePower());
1073 final Map<Name.ExhibitShort, ScoreAndConf> votes = getVotedForExhibitsAndVoteFactors(allowStale, trim);
1074 final int vfSize = votes.size();
1075 // Return an empty result immediately (and don't cache it) if there are no votes to calibrate against.
1076 if(vfSize == 0) { return(Collections.emptyMap()); }
1077
1078 // Get best Scorers with given base name, if any, best first.
1079 final List<String> breedingSet = ((baseName == null) || !getBaseScorersWithoutParameters().contains(baseName)) ?
1080 null : population.getBreedingSet(baseName);
1081 // The effective base name being used, ie null if "as-if" null.
1082 final boolean noBreedingSet = (breedingSet == null) || breedingSet.isEmpty();
1083 final String effectiveBaseName = noBreedingSet ? null : baseName;
1084 final List<String> effectiveBreedingSet = noBreedingSet ? null : breedingSet;
1085
1086 // If the set of voted-for exhibits is no larger than the maximum samples requested
1087 // and no basename is given (ie is null) or no suitable Scorers exist for that basename
1088 // and a mixture of difficult and easy cases has been requested
1089 // then return the full voted-for-exhibits list as-is for speed.
1090 if(noBreedingSet && (vfSize <= maxSamples) && !(difficult instanceof Boolean))
1091 { return(votes); }
1092
1093 // We cannot ever generate a calibration set larger than the number of voted-for exhibits.
1094 final int cappedMaxSamples = Math.min(maxSamples, vfSize);
1095 assert(cappedMaxSamples > 0);
1096 // We'll round this to the nearest power of two not larger than this maximum.
1097 // And this log value is part of our key for efficient/dense representation.
1098 final int cMSLog = GenUtils.log2Approx(cappedMaxSamples);
1099 // Expand back to an actual size limit.
1100 final int roundedMaxSamples = 1 << cMSLog;
1101 assert(roundedMaxSamples <= cappedMaxSamples); // No larger than the original request.
1102 assert(roundedMaxSamples >= cappedMaxSamples/2); // Not less than half the original request size.
1103
1104 // Compute the cache key.
1105 final Triple<String, Integer, Boolean> cacheKey = new Triple<String, Integer, Boolean>(effectiveBaseName, cMSLog, difficult);
1106 // Look up in cache...
1107 final Pair<Long, Map<Name.ExhibitShort, ScoreAndConf>> cachedValue = _eCS_cache.get(cacheKey);
1108 // If we have a cached result, and it is not too old, then return it immediately.
1109 if(cachedValue != null)
1110 {
1111 // If not allowing stale values then we insist on a value from the same LONG 'tick'
1112 // which should force also recomputation when our larger VLONG-timed caches get recomputed,
1113 // else we allow a longer grace period.
1114 final long intervalNumber = EventPeriod.LONG.getIntervalNumber(startTime);
1115 final long intervalNumberCached = EventPeriod.LONG.getIntervalNumber(cachedValue.first);
1116 if(intervalNumber == intervalNumberCached)
1117 { return(cachedValue.second); }
1118 // Allow a couple of ticks for a stale value...
1119 if(allowStale && (intervalNumber - intervalNumberCached < 2))
1120 { return(cachedValue.second); }
1121 // Fall through to recompute the value...
1122 }
1123
1124 // If no baseName or no breeding set for the specified base name
1125 // then generate the generic calibration set.
1126 // If we do have a breeding set, etc, select the best item from the breeding set.
1127 final String bestOfBreed = ((breedingSet == null) || breedingSet.isEmpty()) ? null :
1128 breedingSet.get(0);
1129 final boolean noBoB = (bestOfBreed == null);
1130 final ScorerIF scBoB = noBoB ? null : population.getScorerInstance(bestOfBreed);
1131
1132 if(IsDebug.isDebug) { System.out.println("ScorerCacheImpl.extractCalibrationSet(): bestOfBreed="+bestOfBreed+" for baseName="+baseName); }
1133
1134 // Set of exhibits (short names) that we will return data for.
1135 final Set<Name.ExhibitShort> exhibits = new HashSet<Name.ExhibitShort>(2*roundedMaxSamples);
1136
1137 // If there are no more voted-for exhibits than the max samples requested,
1138 // then we can just return all available voted-for exhibits
1139 // and we need not do any complex/tricky/expensive selection at all.
1140 if(roundedMaxSamples >= votes.size())
1141 { exhibits.addAll(votes.keySet()); }
1142 else
1143 {
1144 // Select example exhibits from the full/available set.
1145
1146 // We need to be able to convert short names to long names...
1147 final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
1148
1149 // Get composite results for all voted-for exhibits
1150 // ranked by ascending closeness to the right answer (descending error*confidence).
1151 // Higher closeness implies successful prediction,
1152 // so the outliers are our hard/easy bad/good samples.
1153 // However, we omit exhibits with zero confidence and zero scores
1154 // as probably indicating that we may simply not have any usable Scorers at all for them.
1155 // Allow this to swallow some I/O and calculation problems
1156 // (not enough to make a significant statistical difference,
1157 // but enough to ride out problems with individual exhibits, network glitches, etc).
1158 final int maxErrors = Math.max(1, roundedMaxSamples >>> 6); // ~1% errors will be ignored.
1159 // Error count during computation.
1160 int errorCount = 0;
1161 final Map<Name.ExhibitShort, ScoreAndConf> closenessScores = new HashMap<Name.ExhibitShort, ScoreAndConf>(2*vfSize);
1162 // We evaluate the exhibits in random order to eliminate order-based biases.
1163 final List<Name.ExhibitShort> shuffledExhibits = new ArrayList<Name.ExhibitShort>(votes.keySet());
1164 Collections.shuffle(shuffledExhibits, new Random(Rnd.fastRnd.nextLong()));
1165 for(final Name.ExhibitShort n : shuffledExhibits)
1166 {
1167 final Name.ExhibitFull fullName = aeid.getFullName(n);
1168 if(fullName == null) { continue; }
1169
1170 final ScoreAndConf sac;
1171 try
1172 {
1173 sac = noBoB ?
1174 computeCompositeScoreAndConfidence(fullName, allowStale) :
1175 computeUnweightedScoreAndConfidence(fullName, scBoB, allowStale);
1176 }
1177 catch(final IOException e)
1178 {
1179 // If error count gets too high then abort...
1180 if(++errorCount > maxErrors) { throw e; }
1181 // ...else just skip this exhibit.
1182 else { continue; }
1183 }
1184 // Skip this exhibit if it cannot be scored at all (eg wrong type).
1185 if((sac.confidence == 0) && (sac.score == 0)) { continue; }
1186 final ScoreAndConf voteSac = votes.get(n);
1187 // The error [0,+MAX] is (half) the absolute difference
1188 // between the actual vote and the predicted score
1189 // multiplied by the cross-confidence (normalised)
1190 // so that lower-confidence pairs are weighted more neutrally.
1191 // The closeness/score is MAX - error.
1192 // The confidence is the cross-confidence.
1193 // These metrics are chosen to work with composite and unweighted scores.
1194 final int crossConfidence = sac.confidence * voteSac.confidence;
1195 final long error = (Math.abs(sac.score - voteSac.score) * (long)crossConfidence)
1196 / (2L * ScoreAndConf.MAX * ScoreAndConf.MAX);
1197 assert((error >= 0) && (error <= ScoreAndConf.MAX));
1198 final ScoreAndConf closeness = new ScoreAndConf(ScoreAndConf.MAX - (int)error, crossConfidence/ScoreAndConf.MAX);
1199 closenessScores.put(n, closeness);
1200 }
1201 // Return an empty result immediately (and don't cache it) if there are no scores to calibrate against.
1202 if(closenessScores.isEmpty()) { return(Collections.emptyMap()); }
1203 // Now compute closeness-sorted (lowest-first) results by exhibit.
1204 final List<Name.ExhibitShort> byCloseness = new ArrayList<Name.ExhibitShort>(closenessScores.keySet());
1205 Collections.sort(byCloseness, new Comparator<Name.ExhibitShort>() {
1206 public final int compare(final Name.ExhibitShort n1, final Name.ExhibitShort n2)
1207 {
1208 final ScoreAndConf sac1 = closenessScores.get(n1);
1209 final ScoreAndConf sac2 = closenessScores.get(n2);
1210 // Primary sort by closeness (score).
1211 if(sac1.score < sac2.score) { return(-1); }
1212 if(sac1.score > sac2.score) { return(+1); }
1213 // Secondary sort (tie breaking) by confidence.
1214 if(sac1.confidence < sac2.confidence) { return(-1); }
1215 if(sac1.confidence > sac2.confidence) { return(+1); }
1216 // Treat as identical.
1217 return(0);
1218 }
1219 });
1220 assert(!byCloseness.isEmpty());
1221 assert(byCloseness.size() == closenessScores.size());
1222 final int bCSize = byCloseness.size();
1223 final int targetResultSize = Math.min(roundedMaxSamples, bCSize);
1224 assert((bCSize == 0) || (closenessScores.get(byCloseness.get(0)).score <= closenessScores.get(byCloseness.get(bCSize-1)).score));
1225 // The number of outliers we will use depends on the "difficult" parameter.
1226 // If a mixture of values has been requested then we include the 25% best and 25% worst.
1227 // Don't have too many extreme cases: "Hard cases make bad law."
1228 final int nOutliersUsed = (difficult == null) ? (targetResultSize >>> 2) : targetResultSize;
1229 // UNLESS only easy cases were were requested THEN add in the most difficult (lowest closeness) cases.
1230 if(!Boolean.FALSE.equals(difficult))
1231 { exhibits.addAll(byCloseness.subList(0, nOutliersUsed)); }
1232 // UNLESS only difficult cases requested THEN add in the easiest (highest closeness) cases.
1233 if(!Boolean.TRUE.equals(difficult))
1234 { exhibits.addAll(byCloseness.subList(bCSize-nOutliersUsed, bCSize)); }
1235
1236 // Top up result set to requested size with intermediate and random samples
1237 // when a mix of results has been requested.
1238 if(difficult == null)
1239 {
1240 // Create a view of the median gap from which we have no samples yet.
1241 final List<Name.ExhibitShort> median = byCloseness.subList(nOutliersUsed, bCSize-nOutliersUsed);
1242 final int medianSize = median.size();
1243
1244 // Inject some representative roughly-evenly-spaced samples
1245 // (roughly half of the remaining free slots in the result).
1246 // There is likely to be some bias given the integer arithmetic involved.
1247 final int medianRegularSampleCount = Math.min(medianSize, targetResultSize-exhibits.size()) / 2;
1248 if(medianRegularSampleCount > 0)
1249 {
1250 final int stepSize = medianSize / medianRegularSampleCount;
1251 for(int i = (stepSize+1)/2; (i < medianSize) && (exhibits.size() < targetResultSize); i += stepSize)
1252 { exhibits.add(median.get(i)); }
1253 }
1254
1255 // Retain some expensive-to-compute values
1256 // between evaluations of our putative calibration sets.
1257 final ConcurrentHashMap<String, ConcurrentMap<Name.ExhibitShort, ScoreAndConf>> workspace = new ConcurrentHashMap<String, ConcurrentMap<Name.ExhibitShort,ScoreAndConf>>();
1258
1259 // Find a selection of random samples that gives the best results,
1260 // by measuring the result calibration set error with different values.
1261 int error = Integer.MAX_VALUE;
1262 Set<Name.ExhibitShort> bestResult = null;
1263 for(int i = MAX_ECS_ITERATIONS; --i >= 0; )
1264 {
1265 final Set<Name.ExhibitShort> putativeCalibSet = new HashSet<Name.ExhibitShort>(2 * targetResultSize);
1266 putativeCalibSet.addAll(exhibits);
1267
1268 // Top-up with random samples from the median range.
1269 // Only samples from the median range are potentially absent from the result so far,
1270 // so that is the only place that we look for (new) random samples from.
1271 // This may be slow if the desired result size is very close (below) the source size.
1272 // So we limit the effort that we expend, and use a faster generator in that case;
1273 // normally we use a good generator to keep our result as random as possible.
1274 final Random r = (targetResultSize < medianSize/2) ? Rnd.goodRnd : Rnd.fastRnd;
1275 for(int j = 2*medianSize; (putativeCalibSet.size() < targetResultSize) && (--j >= 0); )
1276 { putativeCalibSet.add(byCloseness.get(r.nextInt(medianSize))); }
1277
1278 // Compute the 'error' in this putative calibration set.
1279 final Map<Name.ExhibitShort, ScoreAndConf> calibMap = new HashMap<Name.ExhibitShort, ScoreAndConf>(votes);
1280 assert(!calibMap.isEmpty());
1281 assert(!putativeCalibSet.isEmpty());
1282 calibMap.keySet().retainAll(putativeCalibSet);
1283 assert(!calibMap.isEmpty()) : "putativeCalibSet = " + new ArrayList<Name.ExhibitShort>(putativeCalibSet);
1284 final int pErr = _computeCalibrationSetError(calibMap, effectiveBreedingSet, workspace);
1285 if(IsDebug.isDebug) { System.out.println("[ScorerCacheImpl.extractCalibrationSet(): calibration error = "+pErr+" (attempts remaining "+i+").]"); }
1286 assert(pErr >= 0);
1287 // If this is the best so far then record it.
1288 if(pErr < error)
1289 {
1290 error = pErr;
1291 bestResult = putativeCalibSet;
1292 // If the calibration error is zero then we can stop immediately.
1293 if(pErr == 0) { break; }
1294 }
1295 }
1296
1297 // If we found a good result then use it.
1298 if(bestResult != null) { exhibits.clear(); exhibits.addAll(bestResult); }
1299 }
1300 //if(IsDebug.isDebug) { System.out.println("[ScorerCacheImpl.extractCalibrationSet(): vfSize/maxSamples/bCSize/exhibits.size()="+vfSize+'/'+maxSamples+'/'+bCSize+'/'+exhibits.size()+".]"); }
1301 }
1302
1303 // Compute the result map.
1304 final Map<Name.ExhibitShort, ScoreAndConf> r = new HashMap<Name.ExhibitShort, ScoreAndConf>(2*exhibits.size());
1305 // Copy in the vote factor/score for each exemplar exhibit.
1306 for(final Name.ExhibitShort n : exhibits)
1307 { r.put(n, votes.get(n)); }
1308
1309 // Compute the goodness of our model Scorer against this calibration set.
1310 if(bestOfBreed != null)
1311 {
1312 final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
1313 final Pair<ScoreAndConf, Boolean> weighting = ScorerCreator.computeWeighting(r, this, population.getScorerInstance(bestOfBreed), aeid, allowStale);
1314 if(IsDebug.isDebug) { System.out.println("$$$ bestOfBreed SAC and goodness against calibration set: " + weighting.first + ", goodness=" + ScoreAndConf.computeScorerGoodness(weighting.first)); }
1315 }
1316
1317 // Wrap result for immutability and cache it.
1318 final Map<Name.ExhibitShort, ScoreAndConf> result = Collections.unmodifiableMap(r);
1319 _eCS_cache.put(cacheKey, new Pair<Long, Map<Name.ExhibitShort, ScoreAndConf>>(stL, result));
1320
1321 return(result);
1322 }
1323
1324
1325 /**Scorer work object; never null. */
1326 private final ScorerCreator.ScorerWork scorerWork;
1327
1328 /**Expected approximate poll cycle time (ms); strictly positive. */
1329 private static final int APPROX_POLL_CYCLE_MS = 1000;
1330
1331 /**Do some 'evolution' in a background thread if possible.
1332 * If we have too much background activity going on this will do nothing.
1333 * <p>
1334 * This routine always returns quickly.
1335 * <p>
1336 * This should never throw any exceptions.
1337 *
1338 * @param minimal if true, run as short a time as possible to make some progress
1339 */
1340 private void evolve(final boolean minimal)
1341 {
1342 ThreadUtils.lowPriorityThreadPoolDiscardable.submit(new Runnable(){
1343 public final void run()
1344 {
1345 // Set a time to aim to stop processing by on this run.
1346 //
1347 // This is usually greater than the expected cycle time
1348 // to try to enable multiple threads to run in parallel on a multi-CPU server
1349 // (at minimum priority to be friendly to other processing)
1350 // up to the limit imposed by the thread pool.
1351 // However, on a single-CPU box (TODO or unless we are very lightly loaded)
1352 // then we run for much less than the expected cycle time
1353 // to try to avoid hogging all the CPU, especially on a single-CPU server.
1354 final int availableProcessors = Runtime.getRuntime().availableProcessors();
1355 // Set a very low minimum time in 'minimal' mode,
1356 // ie be prepared to stop immediately when we make some measurable progress.
1357 final long minEndTime = System.currentTimeMillis() + 1 + (minimal ? 0 : (APPROX_POLL_CYCLE_MS/8));
1358 // Ensure that the end time is always after a minimum end time...
1359 final long endTime = minEndTime +
1360 ((!minimal && !GenUtils.mustConservePower() && (availableProcessors > 1)) ?
1361 (APPROX_POLL_CYCLE_MS*(2+GenUtils.log2Approx(availableProcessors))) :
1362 (APPROX_POLL_CYCLE_MS/4));
1363
1364 // Do one chunk of work...
1365 try { scorerWork.doChunk(endTime, minEndTime, Rnd.fastRnd.nextBoolean(), inboundScorerQueue); }
1366 catch(final Throwable t)
1367 {
1368 // Note any unexpected errors that get this far...
1369 log.log("ScorerCacheImpl.poll(): unexpected exception "+t.getClass().getName()+": "+t.getMessage());
1370 t.printStackTrace();
1371 }
1372 }
1373 });
1374
1375 }
1376
1377 /**Called/polled periodically (of the order of 1Hz) to do donkey-work and background tasks.
1378 * In particular, this call drives the search for improved Scorers,
1379 * as well as performing housekeeping and maintaining caches to speed foreground tasks.
1380 * <p>
1381 * This launches its work in a low-priority daemon thread,
1382 * and limits the number of such concurrent work threads globally
1383 * by silently discarding any excess,
1384 * ie this call always returns quickly.
1385 * <p>
1386 * This routine should not be called (often) if the host system is under heavy load.
1387 */
1388 public final void poll()
1389 throws IOException
1390 {
1391 // Allow a big timeslice if not conserving power and got lots of free memory...
1392 evolve(GenUtils.mustConservePower() || !MemoryTools.lotsFree());
1393 }
1394
1395
1396 /**Thread-safe, bounded size, queue for inbound Scorers; never null.
1397 * The upper bound on size should usually be set to be a small fraction of population size
1398 * on the grounds that we are unlikely to be able to quickly accept a large volume of new entrants
1399 * in any sensible way.
1400 */
1401 private final ArrayBlockingQueue<String> inboundScorerQueue;
1402
1403 /**Attempt to queue an externally-supplied Scorer value; returns true if accepted.
1404 * This attempts to save the value supplied in our internal bounded-size queue.
1405 * <p>
1406 * Note that this accepts (and discards) anything which is definitely never usable,
1407 * eg syntactically invalid or not for a valid Scorer base name,
1408 * so as to help conserve resources such as memory and queue space.
1409 * <p>
1410 * This routine should always be relatively quick/efficient;
1411 * much more work is performed when an item is dequeued,
1412 * presumably when we know that we have ample resources available.
1413 * <p>
1414 * This may try to start some work immediately if the queue is now getting full.
1415 * <p>
1416 * Should not throw any exceptions; invalid Scorer values are simply discarded with true returned.
1417 */
1418 @Override
1419 public boolean offerExternalScorer(final String externalScorerNameAndParameters)
1420 {
1421 // If the queue is full then indicate that we cannot accept the Scorer value.
1422 if(inboundScorerQueue.remainingCapacity() == 0) { return(false); }
1423 // Validate and queue the value if possible.
1424 try
1425 {
1426 // Check that the Scorer value is parseable.
1427 final Pair<String, Map<String, String>> nap = AbstractScorer.parseNameAndParameters(externalScorerNameAndParameters);
1428 // Immediately absorb/discard unparsable Scorer or with invalid base name.
1429 if(!fixedSimpleScorers.containsKey(nap.first)) { return(true); }
1430 // Canonicalise (and hopefully minimise the size of) the Scorer; discard value if not possible.
1431 final ScorerIF sc = getScorerInstance(externalScorerNameAndParameters);
1432 if(sc == null) { return(true); }
1433 final String canon = AbstractScorer.canonicalise(sc);
1434 // Discard if already known to us at all, even as a stale value.
1435 if(null != population.getScorerWeighting(canon, true)) { return(true); }
1436 // Now try to queue the value and report if we were successful.
1437 final boolean result = inboundScorerQueue.offer(canon);
1438 // If getting full or already so then try to start a little work right now
1439 // (in a low-priority async thread) in the hope of catching up ASAP,
1440 // though not if already horribly stressed or busy.
1441 if(!canAcceptMoreExternalScorers() && !MemoryTools.isMemoryStressed())
1442 { evolve(GenUtils.mustConservePower()); }
1443 // Tell the caller whether their item was queued successfully.
1444 return(result);
1445 }
1446 // Absorb/discard unparsable values immediately.
1447 catch(final IllegalArgumentException e) { return(true); }
1448 }
1449
1450 /**Returns true if this cache can definitely accept (many) more externally-supplied Scorer values.
1451 * Even if this returns false we may in practice be able to accept
1452 * one or more new values: this is indicative.
1453 * <p>
1454 * True if our internal queue is at most about half full,
1455 * but not any sort of guarantee that we can actually accept another Scorer.
1456 * <p>
1457 * In power-conserving mode only welcome new entries if the queue is currently empty,
1458 * ie if we seem to be keeping up with any data coming in.
1459 * This is important because this call can be used to decide
1460 * when to hand out new work to clients,
1461 * from which the results will arrive only much later.
1462 */
1463 @Override
1464 public boolean canAcceptMoreExternalScorers()
1465 {
1466 if(GenUtils.mustConservePower()) { return(inboundScorerQueue.isEmpty()); }
1467 return(inboundScorerQueue.size() < inboundScorerQueue.remainingCapacity());
1468 }
1469
1470 /**Returns true if at least once external Scorer is queued waiting to be processed. */
1471 @Override
1472 public boolean hasQueuedExternalScorer() { return(!inboundScorerQueue.isEmpty()); }
1473 }