001 /*
002 Copyright (c) 1996-2009, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.ai.scorer;
031
032 import java.io.IOException;
033 import java.io.InterruptedIOException;
034 import java.util.ArrayList;
035 import java.util.Collection;
036 import java.util.Collections;
037 import java.util.Comparator;
038 import java.util.HashMap;
039 import java.util.HashSet;
040 import java.util.Iterator;
041 import java.util.List;
042 import java.util.Map;
043 import java.util.Queue;
044 import java.util.Set;
045 import java.util.concurrent.ConcurrentHashMap;
046 import java.util.concurrent.ConcurrentMap;
047 import java.util.concurrent.atomic.AtomicBoolean;
048 import java.util.concurrent.atomic.AtomicInteger;
049 import java.util.concurrent.atomic.AtomicLong;
050
051 import org.freehep.math.minuit.FCNBase;
052 import org.freehep.math.minuit.FunctionMinimum;
053 import org.freehep.math.minuit.MnMinimize;
054 import org.freehep.math.minuit.MnStrategy;
055 import org.freehep.math.minuit.MnUserParameters;
056 import org.hd.d.pg2k.svrCore.AllExhibitImmutableData;
057 import org.hd.d.pg2k.svrCore.CoreConsts;
058 import org.hd.d.pg2k.svrCore.GenUtils;
059 import org.hd.d.pg2k.svrCore.MemoryTools;
060 import org.hd.d.pg2k.svrCore.Name;
061 import org.hd.d.pg2k.svrCore.Rnd;
062 import org.hd.d.pg2k.svrCore.SimpleLoggerIF;
063 import org.hd.d.pg2k.svrCore.Tuple;
064 import org.hd.d.pg2k.svrCore.datasource.SimpleExhibitPipelineIF;
065 import org.hd.d.pg2k.svrCore.vars.EventPeriod;
066 import org.hd.d.pg2k.svrCore.vars.EventVariableValue;
067 import org.hd.d.pg2k.svrCore.vars.SimpleVariableDefinition;
068 import org.hd.d.pg2k.svrCore.vars.SystemVariables;
069
070 import ORG.hd.d.IsDebug;
071
072
073 /**Creates new Scorer (parameter-set) instances given an existing population.
074 *
075 * @author dhd
076 */
077 public final class ScorerCreator
078 {
079 /**Scorer 'source' for recovering from persistent/shared store; never empty nor null. */
080 public static final String SSOURCE_REC = "REC";
081
082 /**This method attempts to retrieve a "new" Scorer from the persisted history.
083 * This mechanism is used to recover previous known-good Scorers from local store and/or
084 * share good Scorers between all participating servers.
085 * This makes for faster start-up of new server instances,
086 * a local and global memory of good genotypes,
087 * and sharing of good genotypes between local populations.
088 *
089 * @return true if a Scorer was found that was not in the extant population
090 * (though may not have been good enough to remain in it)
091 */
092 public static boolean retrieveOldScorer(final SimpleExhibitPipelineIF dataSource,
093 final ScorerCacheIF cache,
094 final ScorerPopulation population,
095 final SimpleLoggerIF log)
096 throws IOException
097 {
098 // Choose between the longest and a less-long period
099 // to try to pick up more variety of persisted Scorer.
100 final EventPeriod intervalSelector = Rnd.fastRnd.nextBoolean() ?
101 EventPeriod.LONG : EventPeriod.VLONG;
102 // Choose a period from the previous full period to as-far-back-as-possible
103 // (for the given period/interval length)
104 // to improve long-term memory of Scorer values tests
105 // and to increase the chance of finding something rare,
106 // but with the choice heavily skewed to more recent events
107 // as more likely to be "better" or even present at all.
108 final long whichPeriod = intervalSelector.getIntervalNumber(System.currentTimeMillis()) -
109 Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(SystemVariables.EVENT_SAMPLES_RETAINED)));
110 // Select from local (larger volume) and global (broadcast) results.
111 final SimpleVariableDefinition def = Rnd.fastRnd.nextBoolean() ?
112 SystemVariables.AI_SCORER_STRING_LOCAL_EVENT : SystemVariables.AI_SCORER_STRING_GLOBAL_EVENT;
113 final EventVariableValue[] evvs = dataSource.getEventValues(def, intervalSelector, whichPeriod, null);
114 if(evvs.length == 0) { return(false); }
115 EventVariableValue evv = evvs[0];
116 // If the selected (full, historical) period has no events
117 // then try the current (partial) period instead,
118 // which has the advantage of getting us bang up-to-date.
119 if((evv == null) || (evv.getTotalDistinctValues() == 0))
120 { evv = dataSource.getEventValue(def, intervalSelector, true); }
121 if((evv == null) || (evv.getTotalDistinctValues() == 0)) { return(false); }
122 // final ArrayList<Object> vals = new ArrayList<Object>(evv.getInfo().keySet());
123 // if(vals.isEmpty()) { return(false); }
124 // final String scorerNameAndStringParameters = (String) vals.get(Rnd.goodRnd.nextInt(vals.size()));
125 // Select an entry at random, though weighted more towards items that were "best" more often.
126 final String scorerNameAndStringParameters = (String) evv.getValueByRank(Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(evv.getTotalDistinctValues())));
127 // Check that it is new...
128 if(population.getScorerWeighting(scorerNameAndStringParameters, true) != null) { return(false); }
129 // Evaluate/record/cache it if possible.
130 final ScoreAndConf sac = cache.computeScorerWeighting(scorerNameAndStringParameters, false, SSOURCE_REC);
131 // Check that it is good enough to remain in the population...
132 if(population.getScorerWeighting(scorerNameAndStringParameters, false) != null) { return(false); }
133 if(IsDebug.isDebug) { log.log("[ScorerCreator.retrieveOldScorer(): imported "+scorerNameAndStringParameters+" ScoreAndConfidence: "+sac+".]"); }
134 return(true); // Found (and imported/evaluated) a new Scorer not in our population.
135 }
136
137 /**Scorer 'source' for the GA combine/mutate method; never empty nor null. */
138 public static final String SSOURCE_GA = "GA";
139
140 /**Create a new Scorer (parameter set) from existing ones using normal GA techniques.
141 * This uses selection, mutation and recombination to attempt to find
142 * a "better" Scorer version based on those in the existing population.
143 *
144 * @param population the extant population; never null
145 *
146 * @return true iff we created and evaluated a new Scorer not in the extant population
147 */
148 public static boolean createNewByGA(final ScorerCacheIF cache,
149 final ScorerPopulation population,
150 final SimpleLoggerIF log)
151 throws IOException
152 {
153 // Pick breeding set to work on.
154 // Allow slightly stale values for speed and robustness.
155 final List<String> breedingSet = pickBreedingSet(cache, population, true);
156 if(breedingSet == null) { return(false); /* No suitable breeding stock... */ }
157
158 // Randomly select one or more different Scorers to mutate and combine.
159 final Set<String> parents = new HashSet<String>();
160 do
161 {
162 // Randomly select a Scorer, skewed towards "better" Scorers.
163 final String extant = breedingSet.get(
164 Rnd.goodRnd.nextInt(1 + Rnd.goodRnd.nextInt(breedingSet.size())));
165 // Add this mutant if different to extant parents (eg if no parents so far).
166 parents.add(extant);
167 // Stop if this parent has zero or one parameters,
168 // since multiple parents cannot be usefully "combined" sexually.
169 if(extant.indexOf(AbstractScorer.SEPARATOR) == extant.lastIndexOf(AbstractScorer.SEPARATOR))
170 { break; }
171 // We try not to draw too many samples from a small breeding set
172 // and we keep low the probability of using many parents.
173 } while((parents.size() < breedingSet.size()/4) && Rnd.goodRnd.nextBoolean());
174 assert(!parents.isEmpty());
175
176 final ScorerIF firstParent = cache.getScorerInstance(parents.iterator().next());
177
178 // Now make the new new variant/child by combining the parents if more than one
179 // and mutating the result.
180 // Else if there is exactly one parent then mutate it to get the new child.
181 final ScorerIF newVariant;
182 if(parents.size() == 1)
183 { newVariant = firstParent.createPerturbedVariant(); }
184 else
185 {
186 // Combine the parents' parameters
187 // and mutate the result/offspring.
188 final ArrayList<List<ScorerParam>> pt = new ArrayList<List<ScorerParam>>(parents.size());
189 for(final String snp : parents)
190 { pt.add(population.getScorerInstance(snp).getParameterDefsAndValues()); }
191 // Shuffle once to get a completely random order.
192 Collections.shuffle(pt, Rnd.goodRnd);
193 // Now, add one token at a time from the first parent,
194 // occasionally shuffling to switch the source of tokens
195 // to be from a different parent.
196 // We stop when the current/first parent is out of tokens.
197 // We force at least one crossover.
198 final int pt0len = pt.get(0).size();
199 final List<ScorerParam> newParams = new ArrayList<ScorerParam>(pt0len);
200 final int forcedCrossoverAfter = (pt0len <= 3) ? 1 : (1+Rnd.goodRnd.nextInt(pt0len-2));
201 // Note that each time, for whatever the current "0" Scorer is,
202 // we test against its length to safely allow for variable parameter numbers.
203 for(int tokenNumber = 0; tokenNumber < pt.get(0).size(); ++tokenNumber)
204 {
205 newParams.add(pt.get(0).get(tokenNumber));
206 // Occasionally switch to another parent.
207 // We make it quite unlikely to switch/crossover at any one point
208 // so as to give some notion of locality/clustering.
209 // FIXME: The frequency/probability of crossover may be very important.
210 if((forcedCrossoverAfter == tokenNumber) ||
211 (Rnd.fastRnd.nextBoolean() && Rnd.goodRnd.nextBoolean()))
212 {
213 final int ptSize = pt.size();
214 // Swap the current 0 item (which is the source of "genes")
215 // randomly with one of the others.
216 // This is analogous to a DNA crossover during meiosis.
217 final int swapIndex = (ptSize == 2) ? 1 : 1+Rnd.goodRnd.nextInt(ptSize-1);
218 final List<ScorerParam> old0 = pt.get(0);
219 pt.set(0, pt.get(swapIndex));
220 pt.set(swapIndex, old0);
221 }
222 }
223 // Get the un-mutated child.
224 final ScorerIF child = firstParent.createVariant(firstParent.getBaseName(), newParams);
225 final String childSNP = child.getNameAndParameters();
226 // Always mutate the child if identical to one parent
227 // or already exists in the population.
228 newVariant = (!parents.contains(childSNP) && (population.getScorerWeighting(child, true) == null)) ?
229 child /* Already new. */ :
230 child.createPerturbedVariant();
231 }
232
233 // Check that this is new (or at least not rescored recently).
234 if(population.getScorerWeighting(newVariant, true) != null) { return(false); }
235
236 //if(IsDebug.isDebug) { log.log("[ScorerCreator.createNewByGA(): trying new variant: " + newVariant + " of " + (new ArrayList<String>(parents)) + "...]"); }
237
238 // Evaluate new variant, automatically recording/sharing if good.
239 /* final ScoreAndConf sac = */ cache.computeScorerWeighting(newVariant, false, SSOURCE_GA);
240 // Check that it is good enough to remain in the population...
241 if(population.getScorerWeighting(newVariant, false) != null) { return(false); }
242 //if(IsDebug.isDebug) { log.log("[ScorerCreator.createNewByGA(): ...variant ScoreAndConfidence: " + sac + ".]"); }
243 return(true); // Successfully created and evaluated a new Scorer.
244 }
245
246
247 /**Create a variant Scorer given an extant same-type Scorer and the array of values of the variable parameters; never null.
248 */
249 private static ScorerIF constructModifiedSNP(final double[] par, final ScorerIF initialScorer, final List<ScorerParam> paramDefsAndValuesOriginal, final int[] mnIndexToParamIndex)
250 {
251 if((par == null) || (initialScorer == null) ||
252 (paramDefsAndValuesOriginal == null) || (mnIndexToParamIndex == null))
253 { throw new IllegalArgumentException(); }
254 // Copy the original parameter values,
255 // overwriting the changed variable parameters under control of the minimiser.
256 final List<ScorerParam> params = new ArrayList<ScorerParam>(paramDefsAndValuesOriginal);
257 for(int i = par.length; --i >= 0; )
258 {
259 final int origParamIndex = mnIndexToParamIndex[i];
260 final ScorerParam p = paramDefsAndValuesOriginal.get(origParamIndex);
261 if(p instanceof ScorerParamInteger)
262 {
263 final ScorerParamInteger pi = (ScorerParamInteger) p;
264 final int newValue = Math.min(pi.max, Math.max(pi.min, (int) Math.round(par[i])));
265 // Replace the parameter in situ if the value has changed from the original.
266 if(pi.value != newValue)
267 { params.set(origParamIndex, ScorerParamInteger.createScorerParamInteger(pi.min, pi.def, pi.max, pi.delta, pi.biasedLow, pi.name, newValue)); }
268 continue;
269 }
270 assert(false); // Unexpected parameter type has been varied.
271 }
272
273 // Now create the full Scorer name with parameters.
274 return(initialScorer.createVariant(initialScorer.getBaseName(), params));
275 }
276
277 // /**If true then a minimiser run that finds a new/'best' Scorer is allowed to run to completion.
278 // * This may make worker threads slow to die, but they should be daemons and interruptable anyway.
279 // */
280 // private static final boolean ALLOW_MIN_RUN_TO_COMPLETION_AFTER_BEST = true;
281
282 /**Scorer 'source' for the "minimiser" method; never empty nor null. */
283 public static final String SSOURCE_MIN = "MIN";
284
285 /**Create a new Scorer (parameter set) from existing ones using a multi-variable "minimiser".
286 * This selects a "good" Scorer and then attempts to find a better/optimal one
287 * derived from it using mathematical error-minimisation techniques.
288 * <p>
289 * For the moment this only attempts to adjust the continuously-variable properties
290 * (ie the integer parameter values) and leaves the enumeration parameters alone.
291 *
292 * @param population the extant population; never null
293 * @param calibSubset if non-null, should be non-empty and is a calibration set
294 * to optimise/minimise against
295 * @param endTime target end time to stop computations at
296 *
297 * @return true iff we created and evaluated a new Scorer not in the extant population
298 */
299 public static boolean createNewByOpt(final SimpleExhibitPipelineIF dataSource,
300 final ScorerCacheIF cache,
301 final ScorerPopulation population,
302 final Map<Name.ExhibitShort, ScoreAndConf> calibSubset,
303 final SimpleLoggerIF log,
304 final long endTime)
305 throws IOException
306 {
307 if((dataSource == null) ||
308 (cache == null) ||
309 (population == null) ||
310 (log == null))
311 { throw new IllegalArgumentException(); }
312
313 final long startTime = System.currentTimeMillis();
314 // We'll extend our finish time if we made progress during the original time slice.
315 final long extendedEndTime = endTime + Math.max(100, 2*(endTime-startTime));
316
317 // Pick breeding set to work on.
318 // Allow slightly stale values for speed and robustness.
319 final List<String> breedingSet = pickBreedingSet(cache, population, true);
320 if(breedingSet == null) { return(false); /* No suitable breeding stock... */ }
321
322 // Are we optimising for error against a small calibration subset?
323 final boolean useCalibSubset = (calibSubset != null) && !calibSubset.isEmpty();
324 // This value is guaranteed to be non-null and non-empty if useCalibSubset is true.
325 final Map<Name.ExhibitShort, ScoreAndConf> calib = (!useCalibSubset) ? null : calibSubset;
326 assert((!useCalibSubset) || ((calib != null) && (!calib.isEmpty()))) : "calib must be null or non-empty";
327
328 final AllExhibitImmutableData aeid = dataSource.getAllExhibitImmutableData(-1);
329
330 // Some of the time, fix some of the otherwise-variable parameters.
331 // we do this to allow the minimiser to focus its fire,
332 // have less chance of trying to adjust interacting parameters, etc.
333 // Only effective when the Scorer has 2 or more (continuously variable) parameters.
334 final boolean fixSomeParameters = Rnd.fastRnd.nextBoolean();
335
336 // If we're fixing some parameters then we can optionally fix all those
337 // at or very close to their end stops.
338 final boolean fixAllExtremeParameters = fixSomeParameters && Rnd.goodRnd.nextBoolean();
339
340 // Some of the time we perturb the initial Scorer we select from the breeding set.
341 // We do this to try to move out of an existing local minimum,
342 // especially as some perturbations may be quite large.
343 final boolean perturbInitialScorer = Rnd.goodRnd.nextBoolean();
344
345 // Randomly select an extant Scorer,
346 // with selection skewed towards "better" Scorers from the available breeding set.
347 final String extant0 = breedingSet.get(
348 Rnd.goodRnd.nextInt(1 + Rnd.goodRnd.nextInt(breedingSet.size())));
349 // Create the Scorer instance with its parameters parsed.
350 final ScorerIF initialScorer0 = cache.getScorerInstance(extant0);
351 assert(initialScorer0 != null) : "must be able to recreate Scorer instance from breeding set SNP: "+extant0; // Must be able to create Scorer from value in extant population.
352 // Note the initial score/goodness.
353 final ScoreAndConf sacExtant0 = cache.computeScorerWeighting(extant0, true, null);
354 // Some of the time, perturb the parameters before we start.
355 // This may help get us out of any local minimum, ie a rut...
356 final String extant;
357 final ScorerIF initialScorer;
358 final ScoreAndConf sacExtant;
359 if(perturbInitialScorer)
360 {
361 // Use perturbed version of extant Scorer.
362 initialScorer = initialScorer0.createPerturbedVariant();
363 extant = initialScorer0.getNameAndParameters();
364 // extant = initialScorer0.getPerturbedNameAndParameters();
365 // initialScorer = cache.getScorerInstance(extant);
366 assert(initialScorer != null) : "must be able to recreate Scorer instance from perturbed SNP: "+extant; // Must be able to create instance for this Scorer variation.
367 sacExtant = cache.computeScorerWeighting(extant, true, SSOURCE_GA); // Simple mutation...
368 }
369 else
370 {
371 // Use selected extant Scorer as-is.
372 initialScorer = initialScorer0;
373 extant = extant0;
374 sacExtant = sacExtant0;
375 }
376 // Compute the goodness of our initial/starting Scorer.
377 final int goodnessExtant = ScoreAndConf.computeScorerGoodness(sacExtant);
378 //if(IsDebug.isDebug) { System.out.println("[ScorerCreator.createNewByOpt(): working on "+extant+"...]"); }
379
380 // Ensure fast in-order and random access to the original parameter definitions.
381 // This assumes that all parameters have unique names.
382 // This must not be altered.
383 final List<ScorerParam> paramDefsAndValuesOriginal = Collections.unmodifiableList(new ArrayList<ScorerParam>(initialScorer.getParameterDefsAndValues()));
384 final int nParams = paramDefsAndValuesOriginal.size();
385
386 // Map from from MnUserParameters params index to paramDefsAndValuesOriginal position.
387 // This makes it easy to simply take a copy of the original
388 // and then overwrite controlled parameter values in place.
389 // We allow this to be over-long...
390 final int mnIndexToParamIndex[] = new int[nParams];
391
392 // // We precompute the expanded versions of all the rest
393 // // to make it relatively quick to assemble a modified parameter set.
394 // final String fixedParams[] = new String[nParams];
395
396 // Create the parameter set,
397 // including only those parameter types with reasonably continuous values,
398 // ie including integer parameters and excluding enumeration parameters.
399 final MnUserParameters params = new MnUserParameters();
400 int mnParamNum = 0;
401 for(int i = 0; i < nParams; ++i)
402 {
403 final ScorerParam p = paramDefsAndValuesOriginal.get(i);
404 assert(p != null) : "parameter cannot be null in paramDefsAndValuesOriginal from "+initialScorer;
405 final String name = p.getName();
406
407 // Allow the minimiser to vary integer-valued parameters.
408 // If we're fixing extreme parameters (which may hinder the minimiser)
409 // then we simply don't vary any parameter within a delta of its limits.
410 if(p instanceof ScorerParamInteger)
411 {
412 final ScorerParamInteger pi = (ScorerParamInteger) p;
413 // Create this as a variable parameter, if:
414 // * not fixing extreme-valued parameters, OR
415 // * not within a delta of the extremes (all but AT the extremes for delta==1).
416 if((!fixAllExtremeParameters) ||
417 ((pi.value - pi.min >= pi.delta) && (pi.max - pi.value >= pi.delta)))
418 {
419 // We set the error tolerance similar the "delta" value
420 // since it is assumed to represent a "small" non-critical change.
421 // We use a minimum error value of 1 to reflect the int granularity.
422 params.add(name, pi.value, (pi.delta < 2) ? 1 : pi.delta, pi.min, pi.max);
423 mnIndexToParamIndex[mnParamNum++] = i; // Provide mapping.
424 continue;
425 }
426 }
427
428 // // For a parameter that cannot be varied (reasonably) smoothly,
429 // // precompute its text representation for speed later.
430 // fixedParams[i] = p.toNameValueString();
431 }
432
433 // If we are fixing some otherwise-variable parameters, we do it here.
434 // We do this only if there is more than one variable parameter.
435 final int variableParamCount = mnParamNum;
436 assert(params.variableParameters() == variableParamCount) : "params.variableParameters() == variableParamCount";
437 if(fixSomeParameters && (variableParamCount > 1))
438 {
439 // We fix at least one of, but never all of, the parameters.
440 // We skew towards leaving few parameters variable.
441 final int parametersToFix = variableParamCount - 1 -
442 Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(variableParamCount-1));
443 assert((parametersToFix > 0) && (parametersToFix < variableParamCount)) : "must not fix all params nor none";
444 //if(IsDebug.isDebug) { log.log("[Fixing "+parametersToFix+" parameter(s) out of "+variableParamCount+"...]"); }
445 // Collection of indices of parameters to fix.
446 final List<Integer> toFix = new ArrayList<Integer>(variableParamCount);
447 for(int i = variableParamCount; --i >= 0; ) { toFix.add(i); }
448 Collections.shuffle(toFix, Rnd.goodRnd);
449 for(int i = parametersToFix; --i >= 0; )
450 { params.fix(toFix.get(i)); }
451 assert(params.variableParameters() == variableParamCount - parametersToFix) : "params.variableParameters() == variableParamCount - parametersToFix";
452 }
453 //if(IsDebug.isDebug) { log.log("[Minimiser: varying parameters "+params.variableParameters()+" out of "+nParams+"...]"); }
454
455 // Local error-signalling class
456 // (to wrap a legit IOException and propagate up through the minimiser).
457 @SuppressWarnings("serial")
458 class WrappedIOException extends RuntimeException
459 { WrappedIOException(final IOException cause) { super(cause); } }
460
461 // Set true if we find a new Scorer.
462 final AtomicBoolean foundNewScorer = new AtomicBoolean();
463
464 // Set true if we find a Scorer better than our initial one.
465 final AtomicBoolean foundBetterNewScorer = new AtomicBoolean();
466 //
467 // // Set true if we found the best-so-far Scorer at least once during this run.
468 // final AtomicBoolean foundBest = new AtomicBoolean();
469
470 // Maximum error that we can generate in the evaluation functor.
471 final int maxERR = ScoreAndConf.MAX_GOODNESS;
472 assert(maxERR > 0) : "maxERR must be strictly positive as an int";
473
474 // Wrap our Scorer-evaluation routine as a standard functor.
475 // We remember all values so far computed on this evaluation
476 // for speed and consistency.
477 // We are prepared to abort the calculation once:
478 // * we have passed our target end time, and,
479 // * we have found a new Scorer parameter set better than our starting set.
480 // Better means BOTH higher "goodness" AND higher correlation.
481 final FCNBase fcn = (new FCNBase() {
482 /**Map of values so-far requested/computed (thread-safe). */
483 private final ConcurrentMap<String,Integer> answers = new ConcurrentHashMap<String, Integer>();
484 /**Return the "error" of the Scorer, to be minimised. */
485 public double valueOf(final double[] par)
486 {
487 final ScorerIF toEvalSc = constructModifiedSNP(par, initialScorer, paramDefsAndValuesOriginal, mnIndexToParamIndex);
488 final String toEvalS = toEvalSc.getNameAndParameters();
489 // After rounding we may find that we have a previous answer to hand
490 // if so, use it for consistency and speed.
491 final Integer cachedAnswer = answers.get(toEvalS);
492 if(cachedAnswer != null) { return(cachedAnswer); }
493 // Find out if this Scorer is novel/new.
494 final boolean isNew = (population.getScorerWeighting(toEvalSc, true) == null);
495 try
496 {
497 int goodness = Integer.MIN_VALUE;
498 // We can stop if we've found a new Scorer better than we started with.
499 // This helps limit the execution time while still ensuring progress.
500 if((!useCalibSubset) || isNew)
501 {
502 // Allow use of stale/cached values here for speed and robustness
503 // with the risk that an inconsistent mix of stale and up-to-date values
504 // may make finding the minimum error difficult or impossible at times.
505 final ScoreAndConf sac = cache.computeScorerWeighting(toEvalSc, true, SSOURCE_MIN);
506 goodness = ScoreAndConf.computeScorerGoodness(sac);
507
508 // Check that new Scorer is good enough to remain in the general population.
509 if(isNew && (population.getScorerWeighting(toEvalSc, false) != null))
510 {
511 // Note if we generated a useful novel Scorer
512 // (ie good enough to have the overall routine return true).
513 foundNewScorer.set(true);
514 // If our new Scorer is better than we started with then this is progress.
515 // We have a strict notion of "better" which is both the "goodness" measure
516 // AND the underlying accuracy (which is more than we are minimising for).
517 if((goodness > goodnessExtant) && (sac.score > sacExtant.score))
518 { foundBetterNewScorer.set(true); }
519 }
520 }
521
522 // Check if we need to abort the minimiser because we've run out of time.
523 // If we haven't found anything better than we started with and are now out of time, abort.
524 if(System.currentTimeMillis() > (foundBetterNewScorer.get() ? extendedEndTime : endTime))
525 {
526 //if(IsDebug.isDebug) { System.out.println("[ScorerCreator.createNewByOpt(): aborting minimiser having run out of time.]"); }
527 throw new InterruptedIOException("aborting minimiser: out of time");
528 }
529 // If this thread has been interrupted, abort immediately.
530 if(Thread.interrupted())
531 { throw new InterruptedIOException("thread interrupted"); }
532
533 // If using the calibration set then use its measure of goodness instead for the error.
534 // Convert score*confidence into (non-positive) error value (to be minimised).
535 // The better the score, the lower the error.
536 // MAX^2 s.c => 0 error; -MAX^2 s.c => maxERR +ve error.
537 // We make sure that we use the same metric that ScorerPopulation does.
538 final int error = maxERR - (((!useCalibSubset) ? goodness :
539 ScoreAndConf.computeScorerGoodness(ScorerCreator.computeWeighting(calib, cache, toEvalSc, aeid, true).first)));
540 //if(IsDebug.isDebug) { System.out.println("[ScorerCreator.createNewByOpt(): error="+(error)+" having evaluated "+toEval+"...]"); }
541 assert((error >= 0) && (error <= 2*maxERR)) : ("Error expected to be in range [0,"+(2*maxERR)+"] but was "+error) ;
542 answers.putIfAbsent(toEvalS, error); // Cache this answer.
543 return(error);
544 }
545 catch(final IOException e)
546 {
547 // Wrap the original exception and rethrow (as our unchecked exception)
548 // which will abort the minimisation relatively gracefully.
549 throw new WrappedIOException(e);
550 }
551 }
552 });
553
554 // Select the general strategy.
555 // We select the strategy randomly to maximise our chance of success.
556 // Skew strategy selection heavily towards "fast" (0) rather than "safe" (2),
557 // since this will be a rather hard function to handle anyway.
558 final MnStrategy strategy = new MnStrategy(Rnd.fastRnd.nextInt(1+Rnd.fastRnd.nextInt(1+Rnd.fastRnd.nextInt(3))));
559
560 // Create the minimiser instance...
561 final MnMinimize minimiser = new MnMinimize(fcn, params, strategy.strategy());
562 // Warn the minimiser that we don't in practice have much intermediate calculation precision.
563 minimiser.setPrecision(1.5/ScoreAndConf.MAX); // Approx limit of precision in score, for example.
564 // Limit the number of function calls that can be made (as they may be very slow)
565 // and we may not have space to retain all the useful intermediate results.
566 // Allow more calls with more parameters.
567 final int maxFnCalls = Math.min(1024 * nParams, population.maxSize()/2); // Crude limits...
568 // Specify a tolerance >> than the integer granularity (1) of the error value.
569 final int resultTolerance = maxERR/100; // Getting within 1% of optimum is just fine!
570 // ...and attempt to find the minimum.
571 // Note that we don't mind too much if the minimiser doesn't think that it finished,
572 // since it may have gotten us some way in the right direction anyway...
573 // final long minimiserStartTime = System.currentTimeMillis();
574 try
575 {
576 final FunctionMinimum minimum = minimiser.minimize(maxFnCalls, resultTolerance);
577 assert(minimum != null) : "function minimum should be non-null";
578 // Don't try to use the function minimum at all if invalid.
579 if(!minimum.isValid()) { return(foundNewScorer.get()); }
580
581 final double par[] = minimiser.params();
582 assert(par != null) : "minimiser.params() should not be null";
583 assert(fixSomeParameters || (par.length == minimum.userParameters().variableParameters())) : "unless fixing parameters, par.length should == minimum.userParameters().variableParameters()";
584 // This is the reconstructed "best" Scorer found so far.
585 final ScorerIF newVariant = constructModifiedSNP(par, initialScorer, paramDefsAndValuesOriginal, mnIndexToParamIndex);
586
587 // Check that this Scorer name-and-parameters value is new.
588 // Note that evaluating the scorer internally may already have cached a result,
589 // so we regard finding a new value as successful if the minimisation succeeded,
590 // or if we had made some progress en passant...
591 if(population.getScorerWeighting(newVariant, true) != null)
592 {
593 return(foundNewScorer.get() ||
594 (minimum.isValid() && !newVariant.getParameterDefsAndValues().equals(extant) &&
595 !newVariant.getParameterDefsAndValues().equals(extant0)));
596 }
597
598 // Evaluate new variant, automatically recording/sharing if good.
599 // Force calculation using fully up-to-date values.
600 /* final ScoreAndConf sac = */ cache.computeScorerWeighting(newVariant, false, SSOURCE_MIN);
601 return(true); // Successfully created and evaluated new creature.
602 }
603 catch(final IOException e) { throw e; /* Propagate I/O problem as-is. */ }
604 catch(final WrappedIOException e)
605 {
606 if(foundNewScorer.get()) { return(true); /* Made progress anyway. */ }
607 // Didn't make progress, so rethrow the exception.
608 if(e.getCause() == null) { throw new IOException("WrappedIOException had null cause"); }
609 throw (IOException) e.getCause(); /* Unwrap and rethrow underlying IOException. */
610 }
611 // finally
612 // {
613 // if(foundGoodEnoughNewScorer.get())
614 // {
615 // final long runTime = System.currentTimeMillis() - startTime;
616 ///* if(IsDebug.isDebug) */ { log.log("[Minimiser: made progress in "+(runTime)+"ms: varied parameters "+params.variableParameters()+" out of "+nParams+"...]"); }
617 // }
618 // }
619 }
620
621
622 /**Class to encapsulate all background and evolution work for a given ScorerCache.
623 * The doChunk() entry point is designed to be thread-safe and highly threadable,
624 * ie can be called from multiple worker threads at once.
625 */
626 public static final class ScorerWork
627 {
628 /**Create instance.
629 * @param scorerCache cache of Scorers to work with; never null
630 * @param log logger; never null
631 * @param allowScorerSharingByEvent if true,
632 * then we try to retrieve persisted/shared Scorers
633 * from the system variables event mechanism
634 */
635 public ScorerWork(final AbstractScorerCache scorerCache,
636 final SimpleLoggerIF log,
637 final boolean allowScorerSharingByEvent)
638 {
639 if((scorerCache == null) || (log == null))
640 { throw new IllegalArgumentException(); }
641 this.scorerCache = scorerCache;
642 this.log = log;
643 this.allowScorerSharingByEvent = allowScorerSharingByEvent;
644 }
645
646 /**Underlying Scorer cache; never null. */
647 private final AbstractScorerCache scorerCache;
648
649 /**Logger; never null. */
650 private final SimpleLoggerIF log;
651
652 /**If true, then we try to retrieve persisted/shared Scorers from the system variables event mechanism. */
653 private final boolean allowScorerSharingByEvent;
654
655 /**Excess time spent running slow evolution methods (ns).
656 * In order to attempt to balance the amount of time
657 * spent on different methods of generating new Scorers,
658 * given that they can be orders of magnitude apart in execution time,
659 * we attempt to balance the total run-time they take.
660 * <p>
661 * This is increased by the execution time of each "slow" instance
662 * and decreased by the execution time of each "fast" instance.
663 * Slow methods may only start when it is negative, fast ones otherwise.
664 * <p>
665 * Run-time for "failed" executions may or may not be included.
666 * <p>
667 * This starts at zero, and the effect of the "fair share" scheduling that this controls
668 * should be to keep it as close to zero (on either side of zero) as possible.
669 * <p>
670 * This is thread-safe.
671 */
672 private final AtomicLong excessSlowTimeNs = new AtomicLong();
673
674 /**Run one chunk of work.
675 * Note that if the minTime or minEndTime values are not more than 1ms beyond 'now'
676 * then this avoids computations that are likely to require especially large
677 * amounts of CPU time/effort.
678 *
679 * @param special if true, do extra background work and normal work
680 * @param inboundEval if non-null then is queue of externally-supplied Scorers
681 * to evaluate and insert into the population if appropriate;
682 * these are regarded as potentially untrusted so are validated and canonicalised before use
683 * @param endTime target end time for this chunk of work to stop by
684 * @param minEndTime time before which we should not stop (ie target minimum quantum)
685 */
686 public final void doChunk(final long endTime, final long minEndTime,
687 final boolean special,
688 final Queue<String> inboundEval)
689 {
690 final ScorerPopulation population = scorerCache.getPopulation();
691
692 // We do some extra housekeeping work first in "special" threads
693 // or when we need to bootstrap.
694 if(special || (population.size() == 0))
695 {
696 if(IsDebug.isDebug && (population.size() == 0)) { System.out.println("ScorerCreator.doChunk(): bootstrapping empty population"); }
697 // Make sure that we get base and best Scorers up to date and keep them so.
698 // This will also ensure that all sorts of supporting data is kept fresh too.
699 final ArrayList<String> shuffledBaseNames = new ArrayList<String>(scorerCache.getBaseScorersWithoutParameters());
700 // Getting these 'best' values should not cause any exceptions or delay.
701 // Re-evaluating these should ensure that the best deserve to remain so,
702 // else incrementally push them out of the way to allow replacements.
703 shuffledBaseNames.addAll(scorerCache.getCurrentScorersWithParameters(Rnd.fastRnd.nextBoolean()));
704 // Shuffle the Scorers to give a fair/equal crack of the whip to all-comers.
705 Collections.shuffle(shuffledBaseNames, Rnd.fastRnd);
706 for(final String scorerName : shuffledBaseNames)
707 {
708 if(IsDebug.isDebug && (population.size() == 0)) { System.out.println("ScorerCreator.doChunk(): bootstrapping empty population with: "+scorerName); }
709 // Attempt to bring the selected Scorer weighting fully up-to-date
710 // if not already so.
711 try { scorerCache.computeScorerWeighting(scorerName, Rnd.fastRnd.nextBoolean(), null); }
712 catch(final IOException e) { /* Ignore probably-transient I/O problem. */ }
713 catch(final Exception e)
714 {
715 // Stop on unexpected error.
716 e.printStackTrace();
717 System.err.println("Problem with computeScorerWeighting("+scorerName+", false): " + e.getMessage());
718 }
719 // Stop and exit this run immediately if we've exhausted our time chunk...
720 // This forces us to get the basics in place and up-to-date first.
721 if(System.currentTimeMillis() > endTime) { return; }
722 }
723 }
724
725 // If there is still an empty population then stop this work chunk immediately
726 // since there is nothing to 'evolve' from,
727 // unless we have inbound queued Scorers
728 // or we are permitted to import Scorers from system events.
729 if((population.size() == 0) && (!allowScorerSharingByEvent) &&
730 ((inboundEval == null) || inboundEval.isEmpty()))
731 { return; }
732
733 // Because the optimiser/minimiser method is possibly hundreds of times slower
734 // than the normal GA (select/combine/mutate) method,
735 // we attempt to force some type of fair-share of CPU time post-hoc.
736 // By selecting outside the loop "fast"/"slow",
737 // we reduce the chance of accidentally starting lots of slow methods at once.
738 // We avoid slow methods where our available time quantum seems very small
739 // at least in relation to our ability to measure time accurately.
740 // TODO: We might also avoid slow methods in power-conserving mode,
741 // since otherwise we may uninterruptibly need a large chunk of energy.
742 final boolean doSlowMethods = (excessSlowTimeNs.get() < 0) &&
743 (minEndTime - System.currentTimeMillis() > 1) &&
744 (endTime - System.currentTimeMillis() > 1);
745 // If using slow methods (eg minimisation)
746 // then some of the time optimise/minimise against a small calibration subset for speed.
747 // If we encounter any problem trying to extract the calibration set, we do without.
748 final boolean useCalibSubset = doSlowMethods && Rnd.fastRnd.nextBoolean();
749 Map<Name.ExhibitShort, ScoreAndConf> calibSubset = null;
750 if(useCalibSubset)
751 {
752 // Try to generate a generic (and large-ish if possible) calibration set
753 // (that reflects strengths/weaknesses amongst all available Scorers)
754 // since we don't know at this point which Scorer we will be tuning.
755 try { calibSubset = scorerCache.extractCalibrationSet(null, ScorerCreator.SUGGESTED_CALIB_SET_SIZE, null, true); }
756 catch(final IOException e) { /* Ignore error and do without calibration subset. */ }
757 }
758
759 // Try to create several new/good Scorers values
760 // or to import several good saved Scorers from local or global state.
761 // Do extra rounds if we've nowhere-near filled up our population/gene pool yet
762 // and/or if computation is going very quickly.
763 final int maxRounds = Math.min(1+SystemVariables.EVENT_SAMPLES_RETAINED,
764 Math.max(3, (population.maxSize() / Math.max(1, population.size()))));
765 int roundsDone = 0; // Decremented on successful creation/import of new variant.
766 try
767 {
768 // Continue until we've made made visible progress
769 // or at least until we've being trying for a reasonable time,
770 // but stop as soon as we pass our target end time or we are interrupted.
771 while(((roundsDone < maxRounds) || (System.currentTimeMillis() <= minEndTime)) &&
772 (System.currentTimeMillis() <= endTime) && (!Thread.interrupted()))
773 {
774 // Attempt to create new Scorer variations
775 // or import extant Scorers from local/global event histories or outside.
776 // In each case, these routines should only return true
777 // if fairly sure that they have made useful progress.
778 if(Rnd.fastRnd.nextBoolean()) // Try to 'import' half the time...
779 {
780 if(allowScorerSharingByEvent && Rnd.fastRnd.nextBoolean())
781 {
782 // Try to fetch a new top Scorer from another population
783 // or find a "golden oldie" from local/global history.
784 // Fast.
785 if(ScorerCreator.retrieveOldScorer(scorerCache.getDataSource(), scorerCache, population, log))
786 { ++roundsDone; }
787 }
788 else if(inboundEval != null)
789 {
790 final String next = inboundEval.poll(); // Non-blocking read.
791 // Count this as successful 'round'
792 // if there is a new inbound Scorer whose canonicalised form is new
793 // and yields a positive goodness.
794 if(next != null)
795 {
796 try
797 {
798 final ScorerIF sc = scorerCache.getScorerInstance(next);
799 if(sc != null)
800 {
801 final String canon = AbstractScorer.canonicalise(sc);
802 if(population.getScorerWeighting(canon, true) == null)
803 {
804 final ScoreAndConf newSAC = scorerCache.computeScorerWeighting(canon, false, "inbound");
805 if(ScoreAndConf.computeScorerGoodness(newSAC) > 0)
806 { ++roundsDone; }
807 }
808 }
809 }
810 catch(final Exception e)
811 { /* Ignore errors from potentially-untrusted inbound Scorer values. */ }
812 }
813 }
814 }
815 else // Try to 'create' half the time.
816 {
817 final long startTime = System.nanoTime();
818 if(!doSlowMethods)
819 {
820 try
821 {
822 // Incremental (quick) creation of new Scorer by GA methods:
823 // selection, combination, mutation.
824 // Fast.
825 if(ScorerCreator.createNewByGA(scorerCache, population, log))
826 { ++roundsDone; }
827 }
828 finally { excessSlowTimeNs.addAndGet(startTime - System.nanoTime()); /* Subtract run time. */ }
829 }
830 else
831 {
832 try
833 {
834 // Potentially slow minimiser-based method, seeking "optimum" parameters.
835 // Can probably execute only once on each run.
836 // Each run may generate many novel Scorer values, however.
837 if(ScorerCreator.createNewByOpt(scorerCache.getDataSource(), scorerCache, population, calibSubset, log, endTime))
838 { ++roundsDone; }
839 }
840 finally { excessSlowTimeNs.addAndGet(System.nanoTime() - startTime); /* Add run time. */ }
841 }
842 }
843
844 // If memory seems to be under huge strain
845 // or the system is conserving power
846 // then abort once we have apparently made some progress
847 // (to avoid system/memory stress and starvation to no end).
848 if((roundsDone > 0) &&
849 (MemoryTools.isMemoryStressed() || GenUtils.mustConservePower()))
850 { return; }
851 }
852 }
853 catch(final IOException e) { /* Stop this run on probably-transient I/O problem. */ }
854 catch(final Throwable t) { t.printStackTrace(); /* Stop this run on serious error. */ }
855 }
856 }
857
858
859 /**Select at random a best-first breeding set of one Scorer base type; null if none available.
860 * Returns the non-empty breeding set of a parameterised Scorer selected at random,
861 * or null if no such breeding set is available.
862 * <p>
863 * We weight this more heavily towards breeding sets whose best constituent has a high "goodness".
864 *
865 * @param population the extant population; never null
866 *
867 * @return null or in-order (best-first) non-empty breeding set of same Scorer type
868 */
869 private static List<String> pickBreedingSet(final ScorerCacheIF cache,
870 final ScorerPopulation population,
871 final boolean allowStale)
872 {
873 // Select at random one of the top or base Scorer names,
874 // though avoiding any that have no parameters or an empty breeding set.
875 // We ensure that the base Scorers are included to allow bootstrapping.
876 // If we have more entries for the bigger populations then than is a useful skew.
877 final List<String> topScorers = new ArrayList<String>(population.getCurrentScorersWithParameters(allowStale));
878 topScorers.addAll(cache.getBaseScorersWithoutParameters());
879
880 // Remove all clearly-ineligible Scorers,
881 // ie those without parameters.
882 for(final Iterator<String> it = topScorers.iterator(); it.hasNext(); )
883 {
884 final ScorerIF sc = cache.getScorerInstance(it.next());
885 final Collection<ScorerParam> parameterDefs = sc.getParameterDefsAndValues();
886 if(parameterDefs.isEmpty()) { it.remove(); } // No variation possible.
887 }
888 // Return null if no candidates left.
889 final int tsSize = topScorers.size();
890 if(tsSize == 0) { return(null); }
891
892 // Sort remainder by (decreasing) goodness, ie best-first.
893 Collections.sort(topScorers, new Comparator<String>() {
894 public int compare(final String o1, final String o2)
895 {
896 final ScoreAndConf sw1 = population.getScorerWeighting(o1, allowStale);
897 final int g1 = (sw1 != null) ? ScoreAndConf.computeScorerGoodness(sw1) : Integer.MIN_VALUE;
898 final ScoreAndConf sw2 = population.getScorerWeighting(o2, allowStale);
899 final int g2 = (sw2 != null) ? ScoreAndConf.computeScorerGoodness(sw2) : Integer.MIN_VALUE;
900 if(g1 < g2) { return(-1); }
901 if(g1 > g2) { return(+1); }
902 return(0); // Identical.
903 }
904 });
905
906 // Try to use breeding set skewed towards earlier (better) values.
907 // If our first choice is not available, work down the list (to less-good Scorers).
908 final int startIndex = Rnd.goodRnd.nextInt(1+Rnd.goodRnd.nextInt(tsSize));
909 for(int i = startIndex; i < tsSize; ++i)
910 {
911 final ScorerIF sc = cache.getScorerInstance(topScorers.get(i));
912 // final Collection<ScorerParam> parameterDefs = sc.getParameterDefsAndValues();
913 // if(parameterDefs.isEmpty()) { continue; } // No variation possible.
914 final List<String> bs = population.getBreedingSet(sc.getBaseName());
915 if(!bs.isEmpty()) { return(bs); }
916 }
917
918 // Fallback...
919 // If we could not get a breeding set with our preferred skewed weighting
920 // then shuffle the names randomly and return the first suitable entry.
921 Collections.shuffle(topScorers, Rnd.fastRnd);
922 for(final String snp : topScorers)
923 {
924 final ScorerIF sc = cache.getScorerInstance(snp);
925 // final Collection<ScorerParam> parameterDefs = sc.getParameterDefsAndValues();
926 // if(parameterDefs.isEmpty()) { continue; } // No variation possible.
927 final List<String> bs = population.getBreedingSet(sc.getBaseName());
928 if(!bs.isEmpty()) { return(bs); }
929 }
930
931 // There really was no suitable breeding sub-population at all.
932 return(null);
933 }
934
935 /**If true then monitor and report performance (CPU usage) in computeWeighting().
936 * Turn this on for (Scorer) performance tuning only...
937 * <p>
938 * This should give a reasonable view of the most compute-intensive Scorer workload.
939 */
940 private static final boolean PERF_SAMPLE_CW = false;
941
942 /**Cumulative performance stats (if any) for computeWeighting(); null iff none being collected. */
943 private static final ConcurrentHashMap<StackTraceElement, AtomicInteger> perfCountsCW = (!PERF_SAMPLE_CW) ? null : new ConcurrentHashMap<StackTraceElement, AtomicInteger>(1001);
944
945 /**Computes ScoreAndConf over the supplied calibration exhibits using the supplied Scorer cache; never null but may be (0,0) where the scorer is unknown or untested.
946 * This can be used as part of the computation of a Scorer's weighting.
947 * The input data must not change while this routine is running.
948 *
949 * @param calibrationData map from short exhibit names to external calibration data
950 * (eg actual user vote-based score); never null
951 *
952 * @return the score (first in the Pair) represents the correlation with the underlying votes
953 * (and whatever the scoring is measured against)
954 * with MAX meaning perfect correlation, 0 meaning no correlation,
955 * and -MAX meaning perfectly wrong answers all the time,
956 * and the confidence 0 if we have no (or very/too few) data points
957 * and approaching MAX as we have a large (enough) number of data points;
958 * the second element of the Pair is true if the the value is "partial"
959 * (we had to prematurely stop calculation or did not have enough data points)
960 * and so for example should not be cached
961 */
962 public static Tuple.Pair<ScoreAndConf,Boolean> computeWeighting(
963 final Map<Name.ExhibitShort, ScoreAndConf> calibrationData,
964 final ScorerCacheIF scorerCache,
965 final ScorerIF scorer,
966 final AllExhibitImmutableData aeid,
967 final boolean allowStale)
968 throws IOException
969 {
970 if((calibrationData == null) ||
971 (scorerCache == null) ||
972 (aeid == null) ||
973 (scorer == null))
974 { throw new IllegalArgumentException(); }
975
976 // Note start time so as to be able to limit effort/time spent in this call.
977 final long startTime = System.currentTimeMillis();
978
979 // If there is no calibration data then we cannot compute (nor cache) a result.
980 if(calibrationData.isEmpty()) { return(new Tuple.Pair<ScoreAndConf, Boolean>(ScoreAndConf.NO_OPINION, true)); }
981
982 // If there are no exhibits then we cannot compute (nor cache) a result.
983 if(aeid.length == 0) { return(new Tuple.Pair<ScoreAndConf, Boolean>(ScoreAndConf.NO_OPINION, true)); }
984
985 // Start performance sampling/monitoring, if any.
986 Thread perfMonitorThread = null;
987 if(PERF_SAMPLE_CW)
988 {
989 perfMonitorThread = GenUtils.startThreadPerfMonitor(
990 Thread.currentThread(),
991 perfCountsCW,
992 null, // "org.hd.d.pg2k.ai.",
993 "org.hd.d.pg2k.", // Monitor thread for at most 10s.
994 System.currentTimeMillis() + 10000, 10); // Sample often enough to obtain a decent profile.
995 }
996 try
997 {
998 // Last captured error, and error count, if any.
999 IOException err = null;
1000 int errorCount = 0;
1001 // If we didn't finish then we should not cache the result.
1002 boolean stoppedPrematurely = false;
1003 // Compute map from short name to score computed by this Scorer.
1004 final Map<Name.ExhibitShort, ScoreAndConf> scorerResults = new HashMap<Name.ExhibitShort, ScoreAndConf>(
1005 2 * calibrationData.size());
1006 // Randomise the order in which we do the computations
1007 // to better queue background work, avoid systemic and order-dependent errors, etc.
1008 final List<Name.ExhibitShort> names = new ArrayList<Name.ExhibitShort>(calibrationData.keySet());
1009 if(Rnd.fastRnd.nextBoolean())
1010 { Collections.shuffle(names, Rnd.fastRnd); }
1011 for(final Name.ExhibitShort shortName : names)
1012 {
1013 // If the exhibit is missing (eg has been deleted or renamed) then skip it.
1014 final Name.ExhibitFull fullName = aeid.getFullName(shortName);
1015 if(fullName == null) { continue; }
1016
1017 try
1018 {
1019 // Compute raw Scorer result for this exhibit.
1020 // Note that this may quickly return zero confidence
1021 // therefore meaning we can skip further work for this exhibit
1022 // since this Scorer may not be suitable for this exhibit type.
1023 final ScoreAndConf sac = scorerCache
1024 .computeUnweightedScoreAndConfidence(fullName,
1025 scorer, allowStale);
1026 // If confidence is zero then skip this exhibit.
1027 if(sac.confidence == 0) { continue; }
1028
1029 // Record this non-zero-confidence result.
1030 scorerResults.put(shortName, sac);
1031 }
1032 catch(final IOException e)
1033 {
1034 // Capture the error.
1035 ++errorCount;
1036 err = e;
1037
1038 // We are prepared to skip over a few missing data points
1039 // (ie ignore a small number of possibly-transient errors)
1040 // though then do not subsequently cache the result
1041 // unless there has been only a very small fraction of errors.
1042 // This may also allow requests (eg for thumbnails/events)
1043 // to be queued in the background ready for next time
1044 // while making computation generally more robust.
1045 final int usableExhibits = scorerResults.size();
1046 if((errorCount <= usableExhibits)
1047 || (System.currentTimeMillis() - startTime < CoreConsts.MAX_INTERACTIVE_DELAY_MS))
1048 { continue; }
1049
1050 // We've spent long enough: do calculations based on what we have so far...
1051 // If we've had to give up and we don't allow stale/partial results then abort...
1052 if(!allowStale) { throw e; }
1053 // ...else continue with what we have but don't cache it.
1054 stoppedPrematurely = true;
1055 break;
1056 }
1057 }
1058 // We will accept the result if there were far more data points
1059 // where we had a usable vote factor AND a non-zero Scorer confidence
1060 // than the count of errors that we hit (and points we should have included)
1061 // OR if there were some exhibits but none at all that we could use with this Scorer.
1062 final int usableExhibits = scorerResults.size();
1063 assert (aeid.length > 0);
1064 // We will accept/trust the result if there were far more data points
1065 // where we had a usable vote factor AND a non-zero Scorer confidence
1066 // than the count of errors that we hit (and points we should have included)
1067 // OR if there were some exhibits but none at all that we could use with this Scorer.
1068 // This value gets filled in after attempting to calibrate.
1069 final boolean enoughDataPoints = (errorCount <= usableExhibits
1070 / ScorerCreator.MIN_SUGGESTED_CALIB_SET_SIZE);
1071 // If not allowing stale/partial results then we must abort after too many errors.
1072 if(!allowStale && !enoughDataPoints) { throw err; }
1073 return(new Tuple.Pair<ScoreAndConf, Boolean>(
1074 ScorerCreator.computeWeighting(scorerResults, calibrationData),
1075 stoppedPrematurely || !enoughDataPoints));
1076 }
1077 finally
1078 {
1079 if(PERF_SAMPLE_CW && (null != perfMonitorThread))
1080 {
1081 try
1082 {
1083 GenUtils.stopPerfMonitorandDumpSamples(perfMonitorThread,
1084 "computeWeighting()",
1085 perfCountsCW,
1086 null,
1087 20, GenUtils.systemOutLogger); // Log to system out...
1088 }
1089 catch(final InterruptedException e)
1090 {
1091 Thread.currentThread().interrupt(); // Don't lose interruptedness...
1092 }
1093 }
1094 }
1095 }
1096
1097 /**Computes ScoreAndConf over the supplied calibration exhibits; never null but may be (0,0) where the scorer is unknown or untested.
1098 * This can be used as part of the computation of a Scorer's weighting.
1099 * <p>
1100 * All keys in the calibration data should map to non-null values in the ratings map
1101 * (ie the ratings keys can be a superset of the calibration keys);
1102 * missing values will diminish the confidence of the result.
1103 * <p>
1104 * The input data must not change while this routine is running.
1105 *
1106 * @param scorerResult map from short exhibit name to Scorer's rating/result for the exhibit;
1107 * never null
1108 * @param calibrationData map from short exhibit name to external calibration data
1109 * (eg actual user vote-based score); never null
1110 *
1111 * @return the score represents the correlation with the underlying votes
1112 * (and whatever the scoring is measured against)
1113 * with MAX meaning perfect correlation, 0 meaning no correlation,
1114 * and -MAX meaning perfectly wrong answers all the time,
1115 * and the confidence 0 if we have no (or very/too few) data points
1116 * and approaching MAX as we have a large (enough) number of data points
1117 */
1118 public static ScoreAndConf computeWeighting(final Map<Name.ExhibitShort, ScoreAndConf> scorerResult,
1119 final Map<Name.ExhibitShort, ScoreAndConf> calibrationData)
1120 {
1121 // If there are no results or no calibration data then we cannot generate a confident answer.
1122 if(scorerResult.isEmpty() || calibrationData.isEmpty())
1123 { return(ScoreAndConf.NO_OPINION); }
1124
1125 // Exhibits which were usable to calibrate this Scorer.
1126 int usableExhibits = 0;
1127 // The total squared error (and confidence) in the range [0, (2*MAX)^2].
1128 // Each error term is multiplied by (moderated downwards by)
1129 // the product of the Scorer and vote-Factor confidence/correlation.
1130 assert((long) ScoreAndConf.MAX * (long) ScoreAndConf.MAX <= Integer.MAX_VALUE);
1131 long cumError = 0;
1132 // The dot product of the Scorer confidence and the vote confidence,
1133 // with theoretical range [0,Y] where Y = ScoreAndConf.MAX^2 * usableExhibits.
1134 long confDP = 0;
1135 for(final Name.ExhibitShort shortName : calibrationData.keySet())
1136 {
1137 // Get vote-based rating (cannot be null or have non-positive correlation).
1138 final ScoreAndConf vote = calibrationData.get(shortName);
1139 // Vote correlation must be strictly positive to be usable.
1140 if(vote.confidence == 0) { continue; }
1141
1142 // Compute raw Scorer result for this exhibit.
1143 // Note that this may quickly return zero confidence
1144 // thus meaning that we can skip further work for this exhibit
1145 // since this Scorer may not be suitable for this exhibit type.
1146 final ScoreAndConf sac = scorerResult.get(shortName);
1147 // If result is missing then sjip this exhibit.
1148 if(sac == null) { continue; }
1149 // If confidence is zero then skip this exhibit.
1150 if(sac.confidence == 0) { continue; }
1151
1152 final int conf = sac.confidence * vote.confidence;
1153 assert(conf >= 0); // Should be no overflow...
1154 // Skip zero or noisy (very-low-confidence) correlations.
1155 if(conf < 1) { continue; }
1156 confDP += conf;
1157 // Compute raw error (difference) in range [0,2*MAX].
1158 // The Scorer result is an actual predicted score for the exhibit.
1159 // The smaller the difference between this predicted score
1160 // and the human-vote-derived score the better.
1161 final int rawError = sac.score - vote.score;
1162 // Compute squared error times the (normalised to [0.0,1.0]) confidence.
1163 final long sqErrConf = ((rawError * (long)rawError) * conf) / (ScoreAndConf.MAX * ScoreAndConf.MAX);
1164 //if(IsDebug.isDebug) { System.out.println("[Error/diff="+error+" @ conf="+conf+" for "+scorerNameAndParameters+" on "+shortName+"; sac="+sac+"; voteFactor="+voteFactor+".]"); }
1165 assert(sqErrConf >= 0);
1166 cumError += sqErrConf;
1167 assert(cumError >= 0); // FIXME: Check that we never overflow.
1168 ++usableExhibits;
1169 }
1170
1171 ScoreAndConf result = ScoreAndConf.NO_OPINION; // Scorer unusable unless proven otherwise.
1172 // We can't compute a result with insufficient cumulative confidence.
1173 // We can cache a valid negative result though.
1174 if((usableExhibits > 0) && (confDP > 0))
1175 {
1176 final double svRangeConf = 1.0;
1177
1178 // Verify that accumulators are in their expected ranges.
1179 assert((confDP >= 0) && (confDP <= (usableExhibits * (long)ScoreAndConf.MAX*ScoreAndConf.MAX)));
1180 assert((cumError >= 0) && (cumError <= (usableExhibits * 4 * (long)ScoreAndConf.MAX*ScoreAndConf.MAX)));
1181
1182 // Compute RMS error in range [0,2*MAX].
1183 final double errorRMSRaw = Math.sqrt(cumError / (confDP / (double)(ScoreAndConf.MAX*ScoreAndConf.MAX)));
1184 assert((errorRMSRaw > -1) && (errorRMSRaw < 1 + 2*ScoreAndConf.MAX)); // We'll allow a little wiggle room for rounding errors...
1185 // Compute score in range +/- ScoreAndConf.MAX.
1186 final long score = ScoreAndConf.MAX - Math.round(errorRMSRaw);
1187 // Compute confidence in range [0,ScoreAndConf.MAX] (plus possible rounding errors).
1188 // We allow extra samples over our minimum desired number
1189 // to compensate for poor per-exhibit confidence.
1190 final long confidence = Math.min(ScoreAndConf.MAX,
1191 Math.round(svRangeConf * (confDP / (usableExhibits*ScoreAndConf.MAX)) *
1192 Math.sqrt(usableExhibits / (double) calibrationData.size())));
1193 result = new ScoreAndConf((short) Math.max(-ScoreAndConf.MAX, Math.min(ScoreAndConf.MAX, score)),
1194 (short) Math.max(0, confidence));
1195 }
1196
1197 return(result);
1198 }
1199
1200 /**Minimum suggested calibration-set size; strictly positive.
1201 * Note that if multiple distinct Scorer populations and target exhibit types are to be covered,
1202 * then a set of this size or greater should be gathered for each of them.
1203 * It may then be convenient to work with the union of those values
1204 * for all Scorer types.
1205 * <p>
1206 * This should be large enough that no one Scorer has enough parameters
1207 * to trivially map each different calibration exhibit to its correct score.
1208 * <p>
1209 * In general, while using a larger calibration sample set increases CPU and memory costs,
1210 * especially for remote 'AH' clients that may need to fetch data,
1211 * a larger calibration set should increase the reliability of calibrated results.
1212 * But there may be a huge advantage in using a set small enough
1213 * to keep in cache close to the top of the memory hierarchy.
1214 * <p>
1215 * We may want to try different calibration set sizes larger than the minimum
1216 * to improve the usefulness and predictive power of the calibration set.
1217 * <p>
1218 * A power of two may allow for generation of slightly better code in various places.
1219 * <p>
1220 * Experience suggests that this value should be at least 128.
1221 */
1222 public static final int MIN_SUGGESTED_CALIB_SET_SIZE = 128;
1223
1224 /**Server-side calibration set size; strictly positive and no less than MIN_SUGGESTED_CALIB_SET_SIZE.
1225 * A larger calibration set of this size implies use of more memory and CPU time,
1226 * but with a higher reliability of answer,
1227 * than using a calibration set of size MIN_SUGGESTED_CALIB_SET_SIZE.
1228 * <p>
1229 * Needs to be at least as large as the minimum 'voted-for' set to give good confidence.
1230 * <p>
1231 * A power of two may allow for generation of slightly better code.
1232 */
1233 public static final int SUGGESTED_CALIB_SET_SIZE =
1234 Math.max(ScorerCacheImpl.MIN_VOTED_FOR_SET_SIZE, 2*MIN_SUGGESTED_CALIB_SET_SIZE);
1235
1236 /**Tunnel root-relative (starting with '/') URL within the server; never null. */
1237 public static final String scorerTunnelRRURL = "/_AI/remote.jsp";
1238 /**Parameter name to request particular remote data type; never null nor empty. */
1239 public static final String DT_PARAM_NAME = "scDataType";
1240 /**Data type parameter value to request calibration-set data; never null nor empty. */
1241 public static final String DT_CALIB = "calibration";
1242 /**Data type parameter value to request server's best Scorers; never null nor empty. */
1243 public static final String DT_BESTSC = "bestScorers";
1244 /**Data type parameter value to POST new 'star' Scorer; never null nor empty. */
1245 public static final String DT_POSTNEWSC = "postNewScorer";
1246 /**Data type parameter name for generic data/query; never null nor empty. */
1247 public static final String DT__GEN_DATA = "q";
1248 }