001 /*
002 Copyright (c) 1996-2012, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.ai.scorer;
031
032 import java.io.IOException;
033 import java.io.InterruptedIOException;
034 import java.util.ArrayList;
035 import java.util.Collection;
036 import java.util.Collections;
037 import java.util.Comparator;
038 import java.util.HashMap;
039 import java.util.HashSet;
040 import java.util.Iterator;
041 import java.util.List;
042 import java.util.Map;
043 import java.util.Queue;
044 import java.util.Set;
045 import java.util.concurrent.ConcurrentHashMap;
046 import java.util.concurrent.ConcurrentMap;
047 import java.util.concurrent.atomic.AtomicBoolean;
048 import java.util.concurrent.atomic.AtomicInteger;
049 import java.util.concurrent.atomic.AtomicLong;
050
051 import org.freehep.math.minuit.FCNBase;
052 import org.freehep.math.minuit.FunctionMinimum;
053 import org.freehep.math.minuit.MnMinimize;
054 import org.freehep.math.minuit.MnStrategy;
055 import org.freehep.math.minuit.MnUserParameters;
056 import org.hd.d.pg2k.svrCore.AllExhibitImmutableData;
057 import org.hd.d.pg2k.svrCore.CoreConsts;
058 import org.hd.d.pg2k.svrCore.GenUtils;
059 import org.hd.d.pg2k.svrCore.MemoryTools;
060 import org.hd.d.pg2k.svrCore.Name;
061 import org.hd.d.pg2k.svrCore.Rnd;
062 import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
063 import org.hd.d.pg2k.svrCore.Tuple;
064 import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
065 import org.hd.d.pg2k.svrCore.vars.EventPeriod;
066 import org.hd.d.pg2k.svrCore.vars.EventVariableValue;
067 import org.hd.d.pg2k.svrCore.vars.SimpleVariableDefinition;
068 import org.hd.d.pg2k.svrCore.vars.SystemVariables;
069
070 import ORG.hd.d.IsDebug;
071
072
073 /**Creates new Scorer (parameter-set) instances given an existing population.
074 *
075 * @author dhd
076 */
077 public final class ScorerCreator
078 {
079 /**Scorer 'source' for recovering from persistent/shared store; never empty nor null. */
080 public static final String SSOURCE_REC = "REC";
081
082 /**This method attempts to retrieve a "new" Scorer from the persisted history.
083 * This mechanism is used to recover previous known-good Scorers from local store and/or
084 * share good Scorers between all participating servers.
085 * This makes for faster start-up of new server instances,
086 * a local and global memory of good genotypes,
087 * and sharing of good genotypes between local populations.
088 *
089 * @return true if a Scorer was found that was not in the extant population
090 * (though may not have been good enough to remain in it)
091 */
092 public static boolean retrieveOldScorer(final SimpleExhibitPipelineIF dataSource,
093 final ScorerCacheIF cache,
094 final ScorerPopulation population,
095 final SimpleLoggerIF log)
096 throws IOException
097 {
098 // Choose between the longest and a less-long period
099 // to try to pick up more variety of persisted Scorer.
100 final EventPeriod intervalSelector = Rnd.fastRnd.nextBoolean() ?
101 EventPeriod.LONG : EventPeriod.VLONG;
102 // Choose a period from the previous full period to as-far-back-as-possible
103 // (for the given period/interval length)
104 // to improve long-term memory of Scorer values tests
105 // and to increase the chance of finding something rare,
106 // but with the choice heavily skewed to more recent events
107 // as more likely to be "better" or even present at all.
108 final long whichPeriod = intervalSelector.getIntervalNumber(System.currentTimeMillis()) -
109 Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(SystemVariables.EVENT_SAMPLES_RETAINED)));
110 // Select from local (larger volume) and global (broadcast) results.
111 final SimpleVariableDefinition def = Rnd.fastRnd.nextBoolean() ?
112 SystemVariables.AI_SCORER_STRING_LOCAL_EVENT : SystemVariables.AI_SCORER_STRING_GLOBAL_EVENT;
113 final EventVariableValue[] evvs = dataSource.getEventValues(def, intervalSelector, whichPeriod, null);
114 if(evvs.length == 0) { return(false); }
115 EventVariableValue evv = evvs[0];
116 // If the selected (full, historical) period has no events
117 // then try the current (partial) period instead,
118 // which has the advantage of getting us bang up-to-date.
119 if((evv == null) || (evv.getTotalDistinctValues() == 0))
120 { evv = dataSource.getEventValue(def, intervalSelector, true); }
121 if((evv == null) || (evv.getTotalDistinctValues() == 0)) { return(false); }
122 // Select an entry at random, though weighted more towards items that were "best" more often.
123 final String scorerNameAndStringParameters = (String) evv.getValueByRank(Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(evv.getTotalDistinctValues())));
124 // Check that it is new...
125 if(population.getScorerWeighting(scorerNameAndStringParameters, true) != null) { return(false); }
126 // Evaluate/record/cache it if possible.
127 final ScoreAndConf sac = cache.computeScorerWeighting(scorerNameAndStringParameters, false, SSOURCE_REC);
128 // Check that it is good enough to remain in the population...
129 if(population.getScorerWeighting(scorerNameAndStringParameters, false) != null) { return(false); }
130 if(IsDebug.isDebug) { log.log("[ScorerCreator.retrieveOldScorer(): imported "+scorerNameAndStringParameters+" ScoreAndConfidence: "+sac+".]"); }
131 return(true); // Found (and imported/evaluated) a new Scorer not in our population.
132 }
133
134 /**Scorer 'source' for the GA combine/mutate method; never empty nor null. */
135 public static final String SSOURCE_GA = "GA";
136
137 /**Create a new Scorer (parameter set) from existing ones using normal GA techniques.
138 * This uses selection, mutation and recombination to attempt to find
139 * a "better" Scorer version based on those in the existing population.
140 *
141 * @param population the extant population; never null
142 *
143 * @return true iff we created and evaluated a new Scorer not in the extant population
144 */
145 public static boolean createNewByGA(final ScorerCacheIF cache,
146 final ScorerPopulation population,
147 final SimpleLoggerIF log)
148 throws IOException
149 {
150 // Pick breeding set to work on.
151 // Allow slightly stale values for speed and robustness.
152 final List<String> breedingSet = pickBreedingSet(cache, population, true);
153 if(breedingSet == null) { return(false); /* No suitable breeding stock... */ }
154
155 // Randomly select one or more different Scorers to mutate and combine.
156 final Set<String> parents = new HashSet<String>();
157 do
158 {
159 // Randomly select a Scorer, skewed towards "better" Scorers.
160 final String extant = breedingSet.get(
161 Rnd.goodRnd.nextInt(1 + Rnd.goodRnd.nextInt(breedingSet.size())));
162 // Add this mutant if different to extant parents (eg if no parents so far).
163 parents.add(extant);
164 // Stop if this parent has zero or one parameters,
165 // since multiple parents cannot be usefully "combined" sexually.
166 if(extant.indexOf(AbstractScorer.SEPARATOR) == extant.lastIndexOf(AbstractScorer.SEPARATOR))
167 { break; }
168 // We try not to draw too many samples from a small breeding set
169 // and we keep low the probability of using many parents.
170 } while((parents.size() < breedingSet.size()/4) && Rnd.goodRnd.nextBoolean());
171 assert(!parents.isEmpty());
172
173 final ScorerIF firstParent = cache.getScorerInstance(parents.iterator().next());
174
175 // Now make the new new variant/child by combining the parents if more than one
176 // and mutating the result.
177 // Else if there is exactly one parent then mutate it to get the new child.
178 final ScorerIF newVariant;
179 if(parents.size() == 1)
180 { newVariant = firstParent.createPerturbedVariant(); }
181 else
182 {
183 // Combine the parents' parameters
184 // and mutate the result/offspring.
185 final ArrayList<List<ScorerParam>> pt = new ArrayList<List<ScorerParam>>(parents.size());
186 for(final String snp : parents)
187 { pt.add(population.getScorerInstance(snp).getParameterDefsAndValues()); }
188 // Shuffle once to get a completely random order.
189 Collections.shuffle(pt, Rnd.goodRnd);
190 // Now, add one token at a time from the first parent,
191 // occasionally shuffling to switch the source of tokens
192 // to be from a different parent.
193 // We stop when the current/first parent is out of tokens.
194 // We force at least one crossover.
195 final int pt0len = pt.get(0).size();
196 final List<ScorerParam> newParams = new ArrayList<ScorerParam>(pt0len);
197 final int forcedCrossoverAfter = (pt0len <= 3) ? 1 : (1+Rnd.goodRnd.nextInt(pt0len-2));
198 // Note that each time, for whatever the current "0" Scorer is,
199 // we test against its length to safely allow for variable parameter numbers.
200 for(int tokenNumber = 0; tokenNumber < pt.get(0).size(); ++tokenNumber)
201 {
202 newParams.add(pt.get(0).get(tokenNumber));
203 // Occasionally switch to another parent.
204 // We make it quite unlikely to switch/crossover at any one point
205 // so as to give some notion of locality/clustering.
206 // FIXME: The frequency/probability of crossover may be very important.
207 if((forcedCrossoverAfter == tokenNumber) ||
208 (Rnd.fastRnd.nextBoolean() && Rnd.goodRnd.nextBoolean()))
209 {
210 final int ptSize = pt.size();
211 // Swap the current 0 item (which is the source of "genes")
212 // randomly with one of the others.
213 // This is analogous to a DNA crossover during meiosis.
214 final int swapIndex = (ptSize == 2) ? 1 : 1+Rnd.goodRnd.nextInt(ptSize-1);
215 final List<ScorerParam> old0 = pt.get(0);
216 pt.set(0, pt.get(swapIndex));
217 pt.set(swapIndex, old0);
218 }
219 }
220 // Get the un-mutated child.
221 final ScorerIF child = firstParent.createVariant(firstParent.getBaseName(), newParams);
222 final String childSNP = child.getNameAndParameters();
223 // Always mutate the child if identical to one parent
224 // or already exists in the population.
225 newVariant = (!parents.contains(childSNP) && (population.getScorerWeighting(child, true) == null)) ?
226 child /* Already new. */ :
227 child.createPerturbedVariant();
228 }
229
230 // Check that this is new (or at least not rescored recently).
231 if(population.getScorerWeighting(newVariant, true) != null) { return(false); }
232
233 //if(IsDebug.isDebug) { log.log("[ScorerCreator.createNewByGA(): trying new variant: " + newVariant + " of " + (new ArrayList<String>(parents)) + "...]"); }
234
235 // Evaluate new variant, automatically recording/sharing if good.
236 /* final ScoreAndConf sac = */ cache.computeScorerWeighting(newVariant, false, SSOURCE_GA);
237 // Check that it is good enough to remain in the population...
238 if(population.getScorerWeighting(newVariant, false) != null) { return(false); }
239 //if(IsDebug.isDebug) { log.log("[ScorerCreator.createNewByGA(): ...variant ScoreAndConfidence: " + sac + ".]"); }
240 return(true); // Successfully created and evaluated a new Scorer.
241 }
242
243
244 /**Create a variant Scorer given an extant same-type Scorer and the array of values of the variable parameters; never null.
245 */
246 private static ScorerIF constructModifiedSNP(final double[] par, final ScorerIF initialScorer, final List<ScorerParam> paramDefsAndValuesOriginal, final int[] mnIndexToParamIndex)
247 {
248 if((par == null) || (initialScorer == null) ||
249 (paramDefsAndValuesOriginal == null) || (mnIndexToParamIndex == null))
250 { throw new IllegalArgumentException(); }
251 // Copy the original parameter values,
252 // overwriting the changed variable parameters under control of the minimiser.
253 final List<ScorerParam> params = new ArrayList<ScorerParam>(paramDefsAndValuesOriginal);
254 for(int i = par.length; --i >= 0; )
255 {
256 final int origParamIndex = mnIndexToParamIndex[i];
257 final ScorerParam p = paramDefsAndValuesOriginal.get(origParamIndex);
258 if(p instanceof ScorerParamInteger)
259 {
260 final ScorerParamInteger pi = (ScorerParamInteger) p;
261 final int newValue = Math.min(pi.max, Math.max(pi.min, (int) Math.round(par[i])));
262 // Replace the parameter in situ if the value has changed from the original.
263 if(pi.value != newValue)
264 { params.set(origParamIndex, ScorerParamInteger.createScorerParamInteger(pi.min, pi.def, pi.max, pi.delta, pi.biasedLow, pi.name, newValue)); }
265 continue;
266 }
267 assert(false); // Unexpected parameter type has been varied.
268 }
269
270 // Now create the full Scorer name with parameters.
271 return(initialScorer.createVariant(initialScorer.getBaseName(), params));
272 }
273
274 // /**If true then a minimiser run that finds a new/'best' Scorer is allowed to run to completion.
275 // * This may make worker threads slow to die, but they should be daemons and interruptable anyway.
276 // */
277 // private static final boolean ALLOW_MIN_RUN_TO_COMPLETION_AFTER_BEST = true;
278
279 /**Scorer 'source' for the "minimiser" method; never empty nor null. */
280 public static final String SSOURCE_MIN = "MIN";
281
282 /**Create a new Scorer (parameter set) from existing ones using a multi-variable "minimiser".
283 * This selects a "good" Scorer and then attempts to find a better/optimal one
284 * derived from it using mathematical error-minimisation techniques.
285 * <p>
286 * For the moment this only attempts to adjust the continuously-variable properties
287 * (ie the integer parameter values) and leaves the enumeration parameters alone.
288 *
289 * @param population the extant population; never null
290 * @param calibSubset if non-null, should be non-empty and is a calibration set
291 * to optimise/minimise against
292 * @param endTime target end time to stop computations at
293 *
294 * @return true iff we created and evaluated a new Scorer not in the extant population
295 */
296 public static boolean createNewByOpt(final SimpleExhibitPipelineIF dataSource,
297 final ScorerCacheIF cache,
298 final ScorerPopulation population,
299 final Map<Name.ExhibitShort, ScoreAndConf> calibSubset,
300 final SimpleLoggerIF log,
301 final long endTime)
302 throws IOException
303 {
304 if((dataSource == null) ||
305 (cache == null) ||
306 (population == null) ||
307 (log == null))
308 { throw new IllegalArgumentException(); }
309
310 final long startTime = System.currentTimeMillis();
311 // We'll extend our finish time if we made progress during the original time slice.
312 final long extendedEndTime = endTime + Math.max(100, 2*(endTime-startTime));
313
314 // Pick breeding set to work on.
315 // Allow slightly stale values for speed and robustness.
316 final List<String> breedingSet = pickBreedingSet(cache, population, true);
317 if(breedingSet == null) { return(false); /* No suitable breeding stock... */ }
318
319 // Are we optimising for error against a small calibration subset?
320 final boolean useCalibSubset = (calibSubset != null) && !calibSubset.isEmpty();
321 // This value is guaranteed to be non-null and non-empty if useCalibSubset is true.
322 final Map<Name.ExhibitShort, ScoreAndConf> calib = (!useCalibSubset) ? null : calibSubset;
323 assert((!useCalibSubset) || ((calib != null) && (!calib.isEmpty()))) : "calib must be null or non-empty";
324
325 final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
326
327 // Some of the time, fix some of the otherwise-variable parameters.
328 // we do this to allow the minimiser to focus its fire,
329 // have less chance of trying to adjust interacting parameters, etc.
330 // Only effective when the Scorer has 2 or more (continuously variable) parameters.
331 final boolean fixSomeParameters = Rnd.fastRnd.nextBoolean();
332
333 // If we're fixing some parameters then we can optionally fix all those
334 // at or very close to their end stops.
335 final boolean fixAllExtremeParameters = fixSomeParameters && Rnd.goodRnd.nextBoolean();
336
337 // Some of the time we perturb the initial Scorer we select from the breeding set.
338 // We do this to try to move out of an existing local minimum,
339 // especially as some perturbations may be quite large.
340 final boolean perturbInitialScorer = Rnd.goodRnd.nextBoolean();
341
342 // Randomly select an extant Scorer,
343 // with selection skewed towards "better" Scorers from the available breeding set.
344 final String extant0 = breedingSet.get(
345 Rnd.goodRnd.nextInt(1 + Rnd.goodRnd.nextInt(breedingSet.size())));
346 // Create the Scorer instance with its parameters parsed.
347 final ScorerIF initialScorer0 = cache.getScorerInstance(extant0);
348 assert(initialScorer0 != null) : "must be able to recreate Scorer instance from breeding set SNP: "+extant0; // Must be able to create Scorer from value in extant population.
349 // Note the initial score/goodness.
350 final ScoreAndConf sacExtant0 = cache.computeScorerWeighting(extant0, true, null);
351 // Some of the time, perturb the parameters before we start.
352 // This may help get us out of any local minimum, ie a rut...
353 final String extant;
354 final ScorerIF initialScorer;
355 final ScoreAndConf sacExtant;
356 if(perturbInitialScorer)
357 {
358 // Use perturbed version of extant Scorer.
359 initialScorer = initialScorer0.createPerturbedVariant();
360 extant = initialScorer0.getNameAndParameters();
361 // extant = initialScorer0.getPerturbedNameAndParameters();
362 // initialScorer = cache.getScorerInstance(extant);
363 assert(initialScorer != null) : "must be able to recreate Scorer instance from perturbed SNP: "+extant; // Must be able to create instance for this Scorer variation.
364 sacExtant = cache.computeScorerWeighting(extant, true, SSOURCE_GA); // Simple mutation...
365 }
366 else
367 {
368 // Use selected extant Scorer as-is.
369 initialScorer = initialScorer0;
370 extant = extant0;
371 sacExtant = sacExtant0;
372 }
373 // Compute the goodness of our initial/starting Scorer.
374 final int goodnessExtant = ScoreAndConf.computeScorerGoodness(sacExtant);
375 //if(IsDebug.isDebug) { System.out.println("[ScorerCreator.createNewByOpt(): working on "+extant+"...]"); }
376
377 // Ensure fast in-order and random access to the original parameter definitions.
378 // This assumes that all parameters have unique names.
379 // This must not be altered.
380 final List<ScorerParam> paramDefsAndValuesOriginal = Collections.unmodifiableList(new ArrayList<ScorerParam>(initialScorer.getParameterDefsAndValues()));
381 final int nParams = paramDefsAndValuesOriginal.size();
382
383 // Map from from MnUserParameters params index to paramDefsAndValuesOriginal position.
384 // This makes it easy to simply take a copy of the original
385 // and then overwrite controlled parameter values in place.
386 // We allow this to be over-long...
387 final int mnIndexToParamIndex[] = new int[nParams];
388
389 // // We precompute the expanded versions of all the rest
390 // // to make it relatively quick to assemble a modified parameter set.
391 // final String fixedParams[] = new String[nParams];
392
393 // Create the parameter set,
394 // including only those parameter types with reasonably continuous values,
395 // ie including integer parameters and excluding enumeration parameters.
396 final MnUserParameters params = new MnUserParameters();
397 int mnParamNum = 0;
398 for(int i = 0; i < nParams; ++i)
399 {
400 final ScorerParam p = paramDefsAndValuesOriginal.get(i);
401 assert(p != null) : "parameter cannot be null in paramDefsAndValuesOriginal from "+initialScorer;
402 final String name = p.getName();
403
404 // Allow the minimiser to vary integer-valued parameters.
405 // If we're fixing extreme parameters (which may hinder the minimiser)
406 // then we simply don't vary any parameter within a delta of its limits.
407 if(p instanceof ScorerParamInteger)
408 {
409 final ScorerParamInteger pi = (ScorerParamInteger) p;
410 // Create this as a variable parameter, if:
411 // * not fixing extreme-valued parameters, OR
412 // * not within a delta of the extremes (all but AT the extremes for delta==1).
413 if((!fixAllExtremeParameters) ||
414 ((pi.value - pi.min >= pi.delta) && (pi.max - pi.value >= pi.delta)))
415 {
416 // We set the error tolerance similar the "delta" value
417 // since it is assumed to represent a "small" non-critical change.
418 // We use a minimum error value of 1 to reflect the int granularity.
419 params.add(name, pi.value, (pi.delta < 2) ? 1 : pi.delta, pi.min, pi.max);
420 mnIndexToParamIndex[mnParamNum++] = i; // Provide mapping.
421 continue;
422 }
423 }
424
425 // // For a parameter that cannot be varied (reasonably) smoothly,
426 // // precompute its text representation for speed later.
427 // fixedParams[i] = p.toNameValueString();
428 }
429
430 // If we are fixing some otherwise-variable parameters, we do it here.
431 // We do this only if there is more than one variable parameter.
432 final int variableParamCount = mnParamNum;
433 assert(params.variableParameters() == variableParamCount) : "params.variableParameters() == variableParamCount";
434 if(fixSomeParameters && (variableParamCount > 1))
435 {
436 // We fix at least one of, but never all of, the parameters.
437 // We skew towards leaving few parameters variable.
438 final int parametersToFix = variableParamCount - 1 -
439 Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(variableParamCount-1));
440 assert((parametersToFix > 0) && (parametersToFix < variableParamCount)) : "must not fix all params nor none";
441 //if(IsDebug.isDebug) { log.log("[Fixing "+parametersToFix+" parameter(s) out of "+variableParamCount+"...]"); }
442 // Collection of indices of parameters to fix.
443 final List<Integer> toFix = new ArrayList<Integer>(variableParamCount);
444 for(int i = variableParamCount; --i >= 0; ) { toFix.add(i); }
445 Collections.shuffle(toFix, Rnd.goodRnd);
446 for(int i = parametersToFix; --i >= 0; )
447 { params.fix(toFix.get(i)); }
448 assert(params.variableParameters() == variableParamCount - parametersToFix) : "params.variableParameters() == variableParamCount - parametersToFix";
449 }
450 //if(IsDebug.isDebug) { log.log("[Minimiser: varying parameters "+params.variableParameters()+" out of "+nParams+"...]"); }
451
452 // Local error-signalling class
453 // (to wrap a legit IOException and propagate up through the minimiser).
454 @SuppressWarnings("serial")
455 class WrappedIOException extends RuntimeException
456 { WrappedIOException(final IOException cause) { super(cause); } }
457
458 // Set true if we find a new Scorer.
459 final AtomicBoolean foundNewScorer = new AtomicBoolean();
460
461 // Set true if we find a Scorer better than our initial one.
462 final AtomicBoolean foundBetterNewScorer = new AtomicBoolean();
463 //
464 // // Set true if we found the best-so-far Scorer at least once during this run.
465 // final AtomicBoolean foundBest = new AtomicBoolean();
466
467 // Maximum error that we can generate in the evaluation functor.
468 final int maxERR = ScoreAndConf.MAX_GOODNESS;
469 assert(maxERR > 0) : "maxERR must be strictly positive as an int";
470
471 // Wrap our Scorer-evaluation routine as a standard functor.
472 // We remember all values so far computed on this evaluation
473 // for speed and consistency.
474 // We are prepared to abort the calculation once:
475 // * we have passed our target end time, and,
476 // * we have found a new Scorer parameter set better than our starting set.
477 // Better means BOTH higher "goodness" AND higher correlation.
478 final FCNBase fcn = (new FCNBase() {
479 /**Map of values so-far requested/computed (thread-safe). */
480 private final ConcurrentMap<String,Integer> answers = new ConcurrentHashMap<String, Integer>();
481 /**Return the "error" of the Scorer, to be minimised. */
482 public double valueOf(final double[] par)
483 {
484 final ScorerIF toEvalSc = constructModifiedSNP(par, initialScorer, paramDefsAndValuesOriginal, mnIndexToParamIndex);
485 final String toEvalS = toEvalSc.getNameAndParameters();
486 // After rounding we may find that we have a previous answer to hand
487 // if so, use it for consistency and speed.
488 final Integer cachedAnswer = answers.get(toEvalS);
489 if(cachedAnswer != null) { return(cachedAnswer); }
490 // Find out if this Scorer is novel/new.
491 final boolean isNew = (population.getScorerWeighting(toEvalSc, true) == null);
492 try
493 {
494 int goodness = Integer.MIN_VALUE;
495 // We can stop if we've found a new Scorer better than we started with.
496 // This helps limit the execution time while still ensuring progress.
497 if((!useCalibSubset) || isNew)
498 {
499 // Allow use of stale/cached values here for speed and robustness
500 // with the risk that an inconsistent mix of stale and up-to-date values
501 // may make finding the minimum error difficult or impossible at times.
502 final ScoreAndConf sac = cache.computeScorerWeighting(toEvalSc, true, SSOURCE_MIN);
503 goodness = ScoreAndConf.computeScorerGoodness(sac);
504
505 // Check that new Scorer is good enough to remain in the general population.
506 if(isNew && (population.getScorerWeighting(toEvalSc, false) != null))
507 {
508 // Note if we generated a useful novel Scorer
509 // (ie good enough to have the overall routine return true).
510 foundNewScorer.set(true);
511 // If our new Scorer is better than we started with then this is progress.
512 // We have a strict notion of "better" which is both the "goodness" measure
513 // AND the underlying accuracy (which is more than we are minimising for).
514 if((goodness > goodnessExtant) && (sac.score > sacExtant.score))
515 { foundBetterNewScorer.set(true); }
516 }
517 }
518
519 // Check if we need to abort the minimiser because we've run out of time.
520 // If we haven't found anything better than we started with and are now out of time, abort.
521 if(System.currentTimeMillis() > (foundBetterNewScorer.get() ? extendedEndTime : endTime))
522 {
523 //if(IsDebug.isDebug) { System.out.println("[ScorerCreator.createNewByOpt(): aborting minimiser having run out of time.]"); }
524 throw new InterruptedIOException("aborting minimiser: out of time");
525 }
526 // If this thread has been interrupted, abort immediately.
527 if(Thread.interrupted())
528 { throw new InterruptedIOException("thread interrupted"); }
529
530 // If using the calibration set then use its measure of goodness instead for the error.
531 // Convert score*confidence into (non-positive) error value (to be minimised).
532 // The better the score, the lower the error.
533 // MAX^2 s.c => 0 error; -MAX^2 s.c => maxERR +ve error.
534 // We make sure that we use the same metric that ScorerPopulation does.
535 final int error = maxERR - (((!useCalibSubset) ? goodness :
536 ScoreAndConf.computeScorerGoodness(ScorerCreator.computeWeighting(calib, cache, toEvalSc, aeid, true).first)));
537 //if(IsDebug.isDebug) { System.out.println("[ScorerCreator.createNewByOpt(): error="+(error)+" having evaluated "+toEval+"...]"); }
538 assert((error >= 0) && (error <= 2*maxERR)) : ("Error expected to be in range [0,"+(2*maxERR)+"] but was "+error) ;
539 answers.putIfAbsent(toEvalS, error); // Cache this answer.
540 return(error);
541 }
542 catch(final IOException e)
543 {
544 // Wrap the original exception and rethrow (as our unchecked exception)
545 // which will abort the minimisation relatively gracefully.
546 throw new WrappedIOException(e);
547 }
548 }
549 });
550
551 // Select the general strategy.
552 // We select the strategy randomly to maximise our chance of success.
553 // Skew strategy selection heavily towards "fast" (0) rather than "safe" (2),
554 // since this will be a rather hard function to handle anyway.
555 final MnStrategy strategy = new MnStrategy(Rnd.fastRnd.nextInt(1+Rnd.fastRnd.nextInt(1+Rnd.fastRnd.nextInt(3))));
556
557 // Create the minimiser instance...
558 final MnMinimize minimiser = new MnMinimize(fcn, params, strategy.strategy());
559 // Warn the minimiser that we don't in practice have much intermediate calculation precision.
560 minimiser.setPrecision(1.5/ScoreAndConf.MAX); // Approx limit of precision in score, for example.
561 // Limit the number of function calls that can be made (as they may be very slow)
562 // and we may not have space to retain all the useful intermediate results.
563 // Allow more calls with more parameters.
564 final int maxFnCalls = Math.min(1024 * nParams, population.maxSize()/2); // Crude limits...
565 // Specify a tolerance >> than the integer granularity (1) of the error value.
566 final int resultTolerance = maxERR/100; // Getting within 1% of optimum is just fine!
567 // ...and attempt to find the minimum.
568 // Note that we don't mind too much if the minimiser doesn't think that it finished,
569 // since it may have gotten us some way in the right direction anyway...
570 // final long minimiserStartTime = System.currentTimeMillis();
571 try
572 {
573 final FunctionMinimum minimum = minimiser.minimize(maxFnCalls, resultTolerance);
574 assert(minimum != null) : "function minimum should be non-null";
575 // Don't try to use the function minimum at all if invalid.
576 if(!minimum.isValid()) { return(foundNewScorer.get()); }
577
578 final double par[] = minimiser.params();
579 assert(par != null) : "minimiser.params() should not be null";
580 assert(fixSomeParameters || (par.length == minimum.userParameters().variableParameters())) : "unless fixing parameters, par.length should == minimum.userParameters().variableParameters()";
581 // This is the reconstructed "best" Scorer found so far.
582 final ScorerIF newVariant = constructModifiedSNP(par, initialScorer, paramDefsAndValuesOriginal, mnIndexToParamIndex);
583
584 // Check that this Scorer name-and-parameters value is new.
585 // Note that evaluating the scorer internally may already have cached a result,
586 // so we regard finding a new value as successful if the minimisation succeeded,
587 // or if we had made some progress en passant...
588 if(population.getScorerWeighting(newVariant, true) != null)
589 {
590 return(foundNewScorer.get() ||
591 (minimum.isValid() && !newVariant.getParameterDefsAndValues().equals(extant) &&
592 !newVariant.getParameterDefsAndValues().equals(extant0)));
593 }
594
595 // Evaluate new variant, automatically recording/sharing if good.
596 // Force calculation using fully up-to-date values.
597 /* final ScoreAndConf sac = */ cache.computeScorerWeighting(newVariant, false, SSOURCE_MIN);
598 return(true); // Successfully created and evaluated new creature.
599 }
600 catch(final IOException e) { throw e; /* Propagate I/O problem as-is. */ }
601 catch(final WrappedIOException e)
602 {
603 if(foundNewScorer.get()) { return(true); /* Made progress anyway. */ }
604 // Didn't make progress, so rethrow the exception.
605 if(e.getCause() == null) { throw new IOException("WrappedIOException had null cause"); }
606 throw (IOException) e.getCause(); /* Unwrap and rethrow underlying IOException. */
607 }
608 // finally
609 // {
610 // if(foundGoodEnoughNewScorer.get())
611 // {
612 // final long runTime = System.currentTimeMillis() - startTime;
613 ///* if(IsDebug.isDebug) */ { log.log("[Minimiser: made progress in "+(runTime)+"ms: varied parameters "+params.variableParameters()+" out of "+nParams+"...]"); }
614 // }
615 // }
616 }
617
618
619 /**Class to encapsulate all background and evolution work for a given ScorerCache.
620 * The doChunk() entry point is designed to be thread-safe and highly threadable,
621 * ie can be called from multiple worker threads at once.
622 */
623 public static final class ScorerWork
624 {
625 /**Create instance.
626 * @param scorerCache cache of Scorers to work with; never null
627 * @param log logger; never null
628 * @param allowScorerSharingByEvent if true,
629 * then we try to retrieve persisted/shared Scorers
630 * from the system variables event mechanism
631 */
632 public ScorerWork(final AbstractScorerCache scorerCache,
633 final SimpleLoggerIF log,
634 final boolean allowScorerSharingByEvent)
635 {
636 if((scorerCache == null) || (log == null))
637 { throw new IllegalArgumentException(); }
638 this.scorerCache = scorerCache;
639 this.log = log;
640 this.allowScorerSharingByEvent = allowScorerSharingByEvent;
641 }
642
643 /**Underlying Scorer cache; never null. */
644 private final AbstractScorerCache scorerCache;
645
646 /**Logger; never null. */
647 private final SimpleLoggerIF log;
648
649 /**If true, then we try to retrieve persisted/shared Scorers from the system variables event mechanism. */
650 private final boolean allowScorerSharingByEvent;
651
652 /**Excess time spent running slow evolution methods (ns).
653 * In order to attempt to balance the amount of time
654 * spent on different methods of generating new Scorers,
655 * given that they can be orders of magnitude apart in execution time,
656 * we attempt to balance the total run-time they take.
657 * <p>
658 * This is increased by the execution time of each "slow" instance
659 * and decreased by the execution time of each "fast" instance.
660 * Slow methods may only start when it is negative, fast ones otherwise.
661 * <p>
662 * Run-time for "failed" executions may or may not be included.
663 * <p>
664 * This starts at zero, and the effect of the "fair share" scheduling that this controls
665 * should be to keep it as close to zero (on either side of zero) as possible.
666 * <p>
667 * This is thread-safe.
668 */
669 private final AtomicLong excessSlowTimeNs = new AtomicLong();
670
671 /**Run one chunk of work.
672 * Note that if the minTime or minEndTime values are not more than 1ms beyond 'now'
673 * then this avoids computations that are likely to require especially large
674 * amounts of CPU time/effort.
675 *
676 * @param special if true, do extra background work and normal work
677 * @param inboundEval if non-null then is queue of externally-supplied Scorers
678 * to evaluate and insert into the population if appropriate;
679 * these are regarded as potentially untrusted so are validated and canonicalised before use
680 * @param endTime target end time for this chunk of work to stop by
681 * @param minEndTime time before which we should not stop (ie target minimum quantum)
682 */
683 public final void doChunk(final long endTime, final long minEndTime,
684 final boolean special,
685 final Queue<String> inboundEval)
686 {
687 final ScorerPopulation population = scorerCache.getPopulation();
688
689 // We do some extra housekeeping work first in "special" threads
690 // or when we need to bootstrap.
691 if(special || (population.size() == 0))
692 {
693 if(IsDebug.isDebug && (population.size() == 0)) { System.out.println("ScorerCreator.doChunk(): bootstrapping empty population"); }
694 // Make sure that we get base and best Scorers up to date and keep them so.
695 // This will also ensure that all sorts of supporting data is kept fresh too.
696 final ArrayList<String> shuffledBaseNames = new ArrayList<String>(scorerCache.getBaseScorersWithoutParameters());
697 // Getting these 'best' values should not cause any exceptions or delay.
698 // Re-evaluating these should ensure that the best deserve to remain so,
699 // else incrementally push them out of the way to allow replacements.
700 shuffledBaseNames.addAll(scorerCache.getCurrentScorersWithParameters(Rnd.fastRnd.nextBoolean()));
701 // Shuffle the Scorers to give a fair/equal crack of the whip to all-comers.
702 Collections.shuffle(shuffledBaseNames, Rnd.fastRnd);
703 for(final String scorerName : shuffledBaseNames)
704 {
705 if(IsDebug.isDebug && (population.size() == 0)) { System.out.println("ScorerCreator.doChunk(): bootstrapping empty population with: "+scorerName); }
706 // Attempt to bring the selected Scorer weighting fully up-to-date
707 // if not already so.
708 try { scorerCache.computeScorerWeighting(scorerName, Rnd.fastRnd.nextBoolean(), null); }
709 catch(final IOException e) { /* Ignore probably-transient I/O problem. */ }
710 catch(final Exception e)
711 {
712 // Stop on unexpected error.
713 e.printStackTrace();
714 System.err.println("Problem with computeScorerWeighting("+scorerName+", false): " + e.getMessage());
715 }
716 // Stop and exit this run immediately if we've exhausted our time chunk...
717 // This forces us to get the basics in place and up-to-date first.
718 if(System.currentTimeMillis() > endTime) { return; }
719 }
720 }
721
722 // If there is still an empty population then stop this work chunk immediately
723 // since there is nothing to 'evolve' from,
724 // unless we have inbound queued Scorers
725 // or we are permitted to import Scorers from system events.
726 if((population.size() == 0) && (!allowScorerSharingByEvent) &&
727 ((inboundEval == null) || inboundEval.isEmpty()))
728 { return; }
729
730 // Because the optimiser/minimiser method is possibly hundreds of times slower
731 // than the normal GA (select/combine/mutate) method,
732 // we attempt to force some type of fair-share of CPU time post-hoc.
733 // By selecting outside the loop "fast"/"slow",
734 // we reduce the chance of accidentally starting lots of slow methods at once.
735 // We avoid slow methods where our available time quantum seems very small
736 // at least in relation to our ability to measure time accurately.
737 // TODO: We might also avoid slow methods in power-conserving mode,
738 // since otherwise we may uninterruptibly need a large chunk of energy.
739 final boolean doSlowMethods = (excessSlowTimeNs.get() < 0) &&
740 (minEndTime - System.currentTimeMillis() > 1) &&
741 (endTime - System.currentTimeMillis() > 1);
742 // If using slow methods (eg minimisation)
743 // then some of the time optimise/minimise against a small calibration subset for speed.
744 // If we encounter any problem trying to extract the calibration set, we do without.
745 final boolean useCalibSubset = doSlowMethods && Rnd.fastRnd.nextBoolean();
746 Map<Name.ExhibitShort, ScoreAndConf> calibSubset = null;
747 if(useCalibSubset)
748 {
749 // Try to generate a generic (and large-ish if possible) calibration set
750 // (that reflects strengths/weaknesses amongst all available Scorers)
751 // since we don't know at this point which Scorer we will be tuning.
752 try { calibSubset = scorerCache.extractCalibrationSet(null, ScorerCreator.SUGGESTED_CALIB_SET_SIZE, null, true); }
753 catch(final IOException e) { /* Ignore error and do without calibration subset. */ }
754 }
755
756 // Try to create several new/good Scorers values
757 // or to import several good saved Scorers from local or global state.
758 // Do extra rounds if we've nowhere-near filled up our population/gene pool yet
759 // and/or if computation is going very quickly.
760 final int maxRounds = Math.min(1+SystemVariables.EVENT_SAMPLES_RETAINED,
761 Math.max(3, (population.maxSize() / Math.max(1, population.size()))));
762 int roundsDone = 0; // Decremented on successful creation/import of new variant.
763 try
764 {
765 // Continue until we've made made visible progress
766 // or at least until we've being trying for a reasonable time,
767 // but stop as soon as we pass our target end time or we are interrupted.
768 while(((roundsDone < maxRounds) || (System.currentTimeMillis() <= minEndTime)) &&
769 (System.currentTimeMillis() <= endTime) && (!Thread.interrupted()))
770 {
771 // Attempt to create new Scorer variations
772 // or import extant Scorers from local/global event histories or outside.
773 // In each case, these routines should only return true
774 // if fairly sure that they have made useful progress.
775 if(Rnd.fastRnd.nextBoolean()) // Try to 'import' half the time...
776 {
777 if(allowScorerSharingByEvent && Rnd.fastRnd.nextBoolean())
778 {
779 // Try to fetch a new top Scorer from another population
780 // or find a "golden oldie" from local/global history.
781 // Fast.
782 if(ScorerCreator.retrieveOldScorer(scorerCache.getDataSource(), scorerCache, population, log))
783 { ++roundsDone; }
784 }
785 else if(inboundEval != null)
786 {
787 final String next = inboundEval.poll(); // Non-blocking read.
788 // Count this as successful 'round'
789 // if there is a new inbound Scorer whose canonicalised form is new
790 // and yields a positive goodness.
791 if(next != null)
792 {
793 try
794 {
795 final ScorerIF sc = scorerCache.getScorerInstance(next);
796 if(sc != null)
797 {
798 final String canon = AbstractScorer.canonicalise(sc);
799 if(population.getScorerWeighting(canon, true) == null)
800 {
801 final ScoreAndConf newSAC = scorerCache.computeScorerWeighting(canon, false, "inbound");
802 if(ScoreAndConf.computeScorerGoodness(newSAC) > 0)
803 { ++roundsDone; }
804 }
805 }
806 }
807 catch(final Exception e)
808 { /* Ignore errors from potentially-untrusted inbound Scorer values. */ }
809 }
810 }
811 }
812 else // Try to 'create' half the time.
813 {
814 final long startTime = System.nanoTime();
815 if(!doSlowMethods)
816 {
817 try
818 {
819 // Incremental (quick) creation of new Scorer by GA methods:
820 // selection, combination, mutation.
821 // Fast.
822 if(ScorerCreator.createNewByGA(scorerCache, population, log))
823 { ++roundsDone; }
824 }
825 finally { excessSlowTimeNs.addAndGet(startTime - System.nanoTime()); /* Subtract run time. */ }
826 }
827 else
828 {
829 try
830 {
831 // Potentially slow minimiser-based method, seeking "optimum" parameters.
832 // Can probably execute only once on each run.
833 // Each run may generate many novel Scorer values, however.
834 if(ScorerCreator.createNewByOpt(scorerCache.getDataSource(), scorerCache, population, calibSubset, log, endTime))
835 { ++roundsDone; }
836 }
837 finally { excessSlowTimeNs.addAndGet(System.nanoTime() - startTime); /* Add run time. */ }
838 }
839 }
840
841 // If memory seems to be under huge strain
842 // or the system is conserving CPU or power
843 // then abort once we have apparently made some progress
844 // (to avoid system/memory stress and starvation to no end).
845 if((roundsDone > 0) &&
846 (MemoryTools.isMemoryStressed() || GenUtils.mustConserveCPU() || GenUtils.mustConservePower()))
847 { return; }
848 }
849 }
850 catch(final IOException e) { /* Stop this run on probably-transient I/O problem. */ }
851 catch(final Throwable t) { t.printStackTrace(); /* Stop this run on serious error. */ }
852 }
853 }
854
855
856 /**Select at random a best-first breeding set of one Scorer base type; null if none available.
857 * Returns the non-empty breeding set of a parameterised Scorer selected at random,
858 * or null if no such breeding set is available.
859 * <p>
860 * We weight this more heavily towards breeding sets whose best constituent has a high "goodness".
861 *
862 * @param population the extant population; never null
863 *
864 * @return null or in-order (best-first) non-empty breeding set of same Scorer type
865 */
866 private static List<String> pickBreedingSet(final ScorerCacheIF cache,
867 final ScorerPopulation population,
868 final boolean allowStale)
869 {
870 // Select at random one of the top or base Scorer names,
871 // though avoiding any that have no parameters or an empty breeding set.
872 // We ensure that the base Scorers are included to allow bootstrapping.
873 // If we have more entries for the bigger populations then than is a useful skew.
874 final List<String> topScorers = new ArrayList<String>(population.getCurrentScorersWithParameters(allowStale));
875 topScorers.addAll(cache.getBaseScorersWithoutParameters());
876
877 // Remove all clearly-ineligible Scorers,
878 // ie those without parameters.
879 for(final Iterator<String> it = topScorers.iterator(); it.hasNext(); )
880 {
881 final ScorerIF sc = cache.getScorerInstance(it.next());
882 final Collection<ScorerParam> parameterDefs = sc.getParameterDefsAndValues();
883 if(parameterDefs.isEmpty()) { it.remove(); } // No variation possible.
884 }
885 // Return null if no candidates left.
886 final int tsSize = topScorers.size();
887 if(tsSize == 0) { return(null); }
888
889 // Sort remainder by (decreasing) goodness, ie best-first.
890 Collections.sort(topScorers, new Comparator<String>() {
891 public int compare(final String o1, final String o2)
892 {
893 final ScoreAndConf sw1 = population.getScorerWeighting(o1, allowStale);
894 final int g1 = (sw1 != null) ? ScoreAndConf.computeScorerGoodness(sw1) : Integer.MIN_VALUE;
895 final ScoreAndConf sw2 = population.getScorerWeighting(o2, allowStale);
896 final int g2 = (sw2 != null) ? ScoreAndConf.computeScorerGoodness(sw2) : Integer.MIN_VALUE;
897 if(g1 < g2) { return(-1); }
898 if(g1 > g2) { return(+1); }
899 return(0); // Identical.
900 }
901 });
902
903 // Try to use breeding set skewed towards earlier (better) values.
904 // If our first choice is not available, work down the list (to less-good Scorers).
905 final int startIndex = Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(tsSize));
906 for(int i = startIndex; i < tsSize; ++i)
907 {
908 final ScorerIF sc = cache.getScorerInstance(topScorers.get(i));
909 // final Collection<ScorerParam> parameterDefs = sc.getParameterDefsAndValues();
910 // if(parameterDefs.isEmpty()) { continue; } // No variation possible.
911 final List<String> bs = population.getBreedingSet(sc.getBaseName());
912 if(!bs.isEmpty()) { return(bs); }
913 }
914
915 // Fallback...
916 // If we could not get a breeding set with our preferred skewed weighting
917 // then shuffle the names randomly and return the first suitable entry.
918 Collections.shuffle(topScorers, Rnd.fastRnd);
919 for(final String snp : topScorers)
920 {
921 final ScorerIF sc = cache.getScorerInstance(snp);
922 // final Collection<ScorerParam> parameterDefs = sc.getParameterDefsAndValues();
923 // if(parameterDefs.isEmpty()) { continue; } // No variation possible.
924 final List<String> bs = population.getBreedingSet(sc.getBaseName());
925 if(!bs.isEmpty()) { return(bs); }
926 }
927
928 // There really was no suitable breeding sub-population at all.
929 return(null);
930 }
931
932 /**If true then monitor and report performance (CPU usage) in computeWeighting().
933 * Turn this on for (Scorer) performance tuning only...
934 * <p>
935 * This should give a reasonable view of the most compute-intensive Scorer workload.
936 */
937 private static final boolean PERF_SAMPLE_CW = false;
938
939 /**Cumulative performance stats (if any) for computeWeighting(); null iff none being collected. */
940 private static final ConcurrentHashMap<StackTraceElement, AtomicInteger> perfCountsCW = (!PERF_SAMPLE_CW) ? null : new ConcurrentHashMap<StackTraceElement, AtomicInteger>(1001);
941
942 /**Computes ScoreAndConf over the supplied calibration exhibits using the supplied Scorer cache; never null but may be (0,0) where the scorer is unknown or untested.
943 * This can be used as part of the computation of a Scorer's weighting.
944 * The input data must not change while this routine is running.
945 *
946 * @param calibrationData map from short exhibit names to external calibration data
947 * (eg actual user vote-based score); never null
948 *
949 * @return the score (first in the Pair) represents the correlation with the underlying votes
950 * (and whatever the scoring is measured against)
951 * with MAX meaning perfect correlation, 0 meaning no correlation,
952 * and -MAX meaning perfectly wrong answers all the time,
953 * and the confidence 0 if we have no (or very/too few) data points
954 * and approaching MAX as we have a large (enough) number of data points;
955 * the second element of the Pair is true if the the value is "partial"
956 * (we had to prematurely stop calculation or did not have enough data points)
957 * and so for example should not be cached
958 */
959 public static Tuple.Pair<ScoreAndConf,Boolean> computeWeighting(
960 final Map<Name.ExhibitShort, ScoreAndConf> calibrationData,
961 final ScorerCacheIF scorerCache,
962 final ScorerIF scorer,
963 final AllExhibitImmutableData aeid,
964 final boolean allowStale)
965 throws IOException
966 {
967 if((calibrationData == null) ||
968 (scorerCache == null) ||
969 (aeid == null) ||
970 (scorer == null))
971 { throw new IllegalArgumentException(); }
972
973 // Note start time so as to be able to limit effort/time spent in this call.
974 final long startTime = System.currentTimeMillis();
975
976 // If there is no calibration data then we cannot compute (nor cache) a result.
977 if(calibrationData.isEmpty()) { return(new Tuple.Pair<ScoreAndConf, Boolean>(ScoreAndConf.NO_OPINION, true)); }
978
979 // If there are no exhibits then we cannot compute (nor cache) a result.
980 if(aeid.length == 0) { return(new Tuple.Pair<ScoreAndConf, Boolean>(ScoreAndConf.NO_OPINION, true)); }
981
982 // Start performance sampling/monitoring, if any.
983 Thread perfMonitorThread = null;
984 if(PERF_SAMPLE_CW)
985 {
986 perfMonitorThread = GenUtils.startThreadPerfMonitor(
987 Thread.currentThread(),
988 perfCountsCW,
989 null, // "org.hd.d.pg2k.ai.",
990 "org.hd.d.pg2k.", // Monitor thread for at most 10s.
991 System.currentTimeMillis() + 10000, 10); // Sample often enough to obtain a decent profile.
992 }
993 try
994 {
995 // Last captured error, and error count, if any.
996 IOException err = null;
997 int errorCount = 0;
998 // If we didn't finish then we should not cache the result.
999 boolean stoppedPrematurely = false;
1000 // Compute map from short name to score computed by this Scorer.
1001 final Map<Name.ExhibitShort, ScoreAndConf> scorerResults = new HashMap<Name.ExhibitShort, ScoreAndConf>(
1002 2 * calibrationData.size());
1003 // Randomise the order in which we do the computations
1004 // to better queue background work, avoid systemic and order-dependent errors, etc.
1005 final List<Name.ExhibitShort> names = new ArrayList<Name.ExhibitShort>(calibrationData.keySet());
1006 if(Rnd.fastRnd.nextBoolean())
1007 { Collections.shuffle(names, Rnd.fastRnd); }
1008 for(final Name.ExhibitShort shortName : names)
1009 {
1010 if(!aeid.isPresent(shortName)) { continue; }
1011 final Name.ExhibitFull fullName = shortName.getFullName();
1012 // // If the exhibit is missing (eg has been deleted or renamed) then skip it.
1013 // final Name.ExhibitFull fullName = aeid.getFullName(shortName);
1014 // if(fullName == null) { continue; }
1015
1016 try
1017 {
1018 // Compute raw Scorer result for this exhibit.
1019 // Note that this may quickly return zero confidence
1020 // therefore meaning we can skip further work for this exhibit
1021 // since this Scorer may not be suitable for this exhibit type.
1022 final ScoreAndConf sac = scorerCache
1023 .computeUnweightedScoreAndConfidence(fullName,
1024 scorer, allowStale);
1025 // If confidence is zero then skip this exhibit.
1026 if(sac.confidence == 0) { continue; }
1027
1028 // Record this non-zero-confidence result.
1029 scorerResults.put(shortName, sac);
1030 }
1031 catch(final IOException e)
1032 {
1033 // Capture the error.
1034 ++errorCount;
1035 err = e;
1036
1037 // We are prepared to skip over a few missing data points
1038 // (ie ignore a small number of possibly-transient errors)
1039 // though then do not subsequently cache the result
1040 // unless there has been only a very small fraction of errors.
1041 // This may also allow requests (eg for thumbnails/events)
1042 // to be queued in the background ready for next time
1043 // while making computation generally more robust.
1044 final int usableExhibits = scorerResults.size();
1045 if((errorCount <= usableExhibits)
1046 || (System.currentTimeMillis() - startTime < CoreConsts.MAX_INTERACTIVE_DELAY_MS))
1047 { continue; }
1048
1049 // We've spent long enough: do calculations based on what we have so far...
1050 // If we've had to give up and we don't allow stale/partial results then abort...
1051 if(!allowStale) { throw e; }
1052 // ...else continue with what we have but don't cache it.
1053 stoppedPrematurely = true;
1054 break;
1055 }
1056 }
1057 // We will accept the result if there were far more data points
1058 // where we had a usable vote factor AND a non-zero Scorer confidence
1059 // than the count of errors that we hit (and points we should have included)
1060 // OR if there were some exhibits but none at all that we could use with this Scorer.
1061 final int usableExhibits = scorerResults.size();
1062 assert (aeid.length > 0);
1063 // We will accept/trust the result if there were far more data points
1064 // where we had a usable vote factor AND a non-zero Scorer confidence
1065 // than the count of errors that we hit (and points we should have included)
1066 // OR if there were some exhibits but none at all that we could use with this Scorer.
1067 // This value gets filled in after attempting to calibrate.
1068 final boolean enoughDataPoints = (errorCount <= usableExhibits
1069 / ScorerCreator.MIN_SUGGESTED_CALIB_SET_SIZE);
1070 // If not allowing stale/partial results then we must abort after too many errors.
1071 if(!allowStale && !enoughDataPoints) { throw err; }
1072 return(new Tuple.Pair<ScoreAndConf, Boolean>(
1073 ScorerCreator.computeWeighting(scorerResults, calibrationData),
1074 stoppedPrematurely || !enoughDataPoints));
1075 }
1076 finally
1077 {
1078 if(PERF_SAMPLE_CW && (null != perfMonitorThread))
1079 {
1080 try
1081 {
1082 GenUtils.stopPerfMonitorandDumpSamples(perfMonitorThread,
1083 "computeWeighting()",
1084 perfCountsCW,
1085 null,
1086 20, GenUtils.systemOutLogger); // Log to system out...
1087 }
1088 catch(final InterruptedException e)
1089 {
1090 Thread.currentThread().interrupt(); // Don't lose interruptedness...
1091 }
1092 }
1093 }
1094 }
1095
1096 /**Computes ScoreAndConf over the supplied calibration exhibits; never null but may be (0,0) where the scorer is unknown or untested.
1097 * This can be used as part of the computation of a Scorer's weighting.
1098 * <p>
1099 * All keys in the calibration data should map to non-null values in the ratings map
1100 * (ie the ratings keys can be a superset of the calibration keys);
1101 * missing values will diminish the confidence of the result.
1102 * <p>
1103 * The input data must not change while this routine is running.
1104 *
1105 * @param scorerResult map from short exhibit name to Scorer's rating/result for the exhibit;
1106 * never null
1107 * @param calibrationData map from short exhibit name to external calibration data
1108 * (eg actual user vote-based score); never null
1109 *
1110 * @return the score represents the correlation with the underlying votes
1111 * (and whatever the scoring is measured against)
1112 * with MAX meaning perfect correlation, 0 meaning no correlation,
1113 * and -MAX meaning perfectly wrong answers all the time,
1114 * and the confidence 0 if we have no (or very/too few) data points
1115 * and approaching MAX as we have a large (enough) number of data points
1116 */
1117 public static ScoreAndConf computeWeighting(final Map<Name.ExhibitShort, ScoreAndConf> scorerResult,
1118 final Map<Name.ExhibitShort, ScoreAndConf> calibrationData)
1119 {
1120 // If there are no results or no calibration data then we cannot generate a confident answer.
1121 if(scorerResult.isEmpty() || calibrationData.isEmpty())
1122 { return(ScoreAndConf.NO_OPINION); }
1123
1124 // Exhibits which were usable to calibrate this Scorer.
1125 int usableExhibits = 0;
1126 // The total squared error (and confidence) in the range [0, (2*MAX)^2].
1127 // Each error term is multiplied by (moderated downwards by)
1128 // the product of the Scorer and vote-Factor confidence/correlation.
1129 assert((long) ScoreAndConf.MAX * (long) ScoreAndConf.MAX <= Integer.MAX_VALUE);
1130 long cumError = 0;
1131 // The dot product of the Scorer confidence and the vote confidence,
1132 // with theoretical range [0,Y] where Y = ScoreAndConf.MAX^2 * usableExhibits.
1133 long confDP = 0;
1134 for(final Name.ExhibitShort shortName : calibrationData.keySet())
1135 {
1136 // Get vote-based rating (cannot be null or have non-positive correlation).
1137 final ScoreAndConf vote = calibrationData.get(shortName);
1138 // Vote correlation must be strictly positive to be usable.
1139 if(vote.confidence == 0) { continue; }
1140
1141 // Compute raw Scorer result for this exhibit.
1142 // Note that this may quickly return zero confidence
1143 // thus meaning that we can skip further work for this exhibit
1144 // since this Scorer may not be suitable for this exhibit type.
1145 final ScoreAndConf sac = scorerResult.get(shortName);
1146 // If result is missing then sjip this exhibit.
1147 if(sac == null) { continue; }
1148 // If confidence is zero then skip this exhibit.
1149 if(sac.confidence == 0) { continue; }
1150
1151 final int conf = sac.confidence * vote.confidence;
1152 assert(conf >= 0); // Should be no overflow...
1153 // Skip zero or noisy (very-low-confidence) correlations.
1154 if(conf < 1) { continue; }
1155 confDP += conf;
1156 // Compute raw error (difference) in range [0,2*MAX].
1157 // The Scorer result is an actual predicted score for the exhibit.
1158 // The smaller the difference between this predicted score
1159 // and the human-vote-derived score the better.
1160 final int rawError = sac.score - vote.score;
1161 // Compute squared error times the (normalised to [0.0,1.0]) confidence.
1162 final long sqErrConf = ((rawError * (long)rawError) * conf) / (ScoreAndConf.MAX * ScoreAndConf.MAX);
1163 //if(IsDebug.isDebug) { System.out.println("[Error/diff="+error+" @ conf="+conf+" for "+scorerNameAndParameters+" on "+shortName+"; sac="+sac+"; voteFactor="+voteFactor+".]"); }
1164 assert(sqErrConf >= 0);
1165 cumError += sqErrConf;
1166 assert(cumError >= 0); // FIXME: Check that we never overflow.
1167 ++usableExhibits;
1168 }
1169
1170 ScoreAndConf result = ScoreAndConf.NO_OPINION; // Scorer unusable unless proven otherwise.
1171 // We can't compute a result with insufficient cumulative confidence.
1172 // We can cache a valid negative result though.
1173 if((usableExhibits > 0) && (confDP > 0))
1174 {
1175 final double svRangeConf = 1.0;
1176
1177 // Verify that accumulators are in their expected ranges.
1178 assert((confDP >= 0) && (confDP <= (usableExhibits * (long)ScoreAndConf.MAX*ScoreAndConf.MAX)));
1179 assert((cumError >= 0) && (cumError <= (usableExhibits * 4 * (long)ScoreAndConf.MAX*ScoreAndConf.MAX)));
1180
1181 // Compute RMS error in range [0,2*MAX].
1182 final double errorRMSRaw = Math.sqrt(cumError / (confDP / (double)(ScoreAndConf.MAX*ScoreAndConf.MAX)));
1183 assert((errorRMSRaw > -1) && (errorRMSRaw < 1 + 2*ScoreAndConf.MAX)); // We'll allow a little wiggle room for rounding errors...
1184 // Compute score in range +/- ScoreAndConf.MAX.
1185 final long score = ScoreAndConf.MAX - Math.round(errorRMSRaw);
1186 // Compute confidence in range [0,ScoreAndConf.MAX] (plus possible rounding errors).
1187 // We allow extra samples over our minimum desired number
1188 // to compensate for poor per-exhibit confidence.
1189 final long confidence = Math.min(ScoreAndConf.MAX,
1190 Math.round(svRangeConf * (confDP / (usableExhibits*ScoreAndConf.MAX)) *
1191 Math.sqrt(usableExhibits / (double) calibrationData.size())));
1192 result = new ScoreAndConf((short) Math.max(-ScoreAndConf.MAX, Math.min(ScoreAndConf.MAX, score)),
1193 (short) Math.max(0, confidence));
1194 }
1195
1196 return(result);
1197 }
1198
1199 /**Minimum suggested calibration-set size; strictly positive.
1200 * Note that if multiple distinct Scorer populations and target exhibit types are to be covered,
1201 * then a set of this size or greater should be gathered for each of them.
1202 * It may then be convenient to work with the union of those values
1203 * for all Scorer types.
1204 * <p>
1205 * This should be large enough that no one Scorer has enough parameters
1206 * to trivially map each different calibration exhibit to its correct score.
1207 * <p>
1208 * In general, while using a larger calibration sample set increases CPU and memory costs,
1209 * especially for remote 'AH' clients that may need to fetch data,
1210 * a larger calibration set should increase the reliability of calibrated results.
1211 * But there may be a huge advantage in using a set small enough
1212 * to keep in cache close to the top of the memory hierarchy.
1213 * <p>
1214 * We may want to try different calibration set sizes larger than the minimum
1215 * to improve the usefulness and predictive power of the calibration set.
1216 * <p>
1217 * A power of two may allow for generation of slightly better code in various places.
1218 * <p>
1219 * Experience suggests that this value should be at least 128.
1220 */
1221 public static final int MIN_SUGGESTED_CALIB_SET_SIZE = 128;
1222
1223 /**Server-side calibration set size; strictly positive and no less than MIN_SUGGESTED_CALIB_SET_SIZE.
1224 * A larger calibration set of this size implies use of more memory and CPU time,
1225 * but with a higher reliability of answer,
1226 * than using a calibration set of size MIN_SUGGESTED_CALIB_SET_SIZE.
1227 * <p>
1228 * Needs to be at least as large as the minimum 'voted-for' set to give good confidence.
1229 * <p>
1230 * A power of two may allow for generation of slightly better code.
1231 */
1232 public static final int SUGGESTED_CALIB_SET_SIZE =
1233 Math.max(ScorerCacheImpl.MIN_VOTED_FOR_SET_SIZE, 2*MIN_SUGGESTED_CALIB_SET_SIZE);
1234
1235 /**Tunnel root-relative (starting with '/') URL within the server; never null. */
1236 public static final String scorerTunnelRRURL = "/_AI/remote.jsp";
1237 /**Parameter name to request particular remote data type; never null nor empty. */
1238 public static final String DT_PARAM_NAME = "scDataType";
1239 /**Data type parameter value to request calibration-set data; never null nor empty. */
1240 public static final String DT_CALIB = "calibration";
1241 /**Data type parameter value to request server's best Scorers; never null nor empty. */
1242 public static final String DT_BESTSC = "bestScorers";
1243 /**Data type parameter value to POST new 'star' Scorer; never null nor empty. */
1244 public static final String DT_POSTNEWSC = "postNewScorer";
1245 /**Data type parameter name for generic data/query; never null nor empty. */
1246 public static final String DT__GEN_DATA = "q";
1247 }