001 /*
002 Copyright (c) 1996-2011, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.webSvr.util;
031
032 import java.io.IOException;
033 import java.util.Collections;
034 import java.util.HashSet;
035 import java.util.List;
036 import java.util.Set;
037 import java.util.SortedSet;
038 import java.util.concurrent.Callable;
039 import java.util.concurrent.Future;
040
041 import org.hd.d.pg2k.svrCore.AllExhibitImmutableData;
042 import org.hd.d.pg2k.svrCore.ExhibitAttrUtils;
043 import org.hd.d.pg2k.svrCore.ExhibitName;
044 import org.hd.d.pg2k.svrCore.MemoryTools;
045 import org.hd.d.pg2k.svrCore.Name;
046 import org.hd.d.pg2k.svrCore.TextUtils;
047 import org.hd.d.pg2k.svrCore.TextUtils.CharSequence8Bit;
048 import org.hd.d.pg2k.svrCore.ThreadUtils;
049 import org.hd.d.pg2k.webSvr.exhibit.DataSourceBean;
050
051 import ORG.hd.d.IsDebug;
052 import ORG.hd.d.jIndexer.server.JIndexBean;
053
054 /**Bean for cached search results.
055 * Uses SearchUtils.doRelatedExhibitsSearch() to compute results lazily
056 * and caches the results.
057 * <p>
058 * Cache size is limited at most in proportion to the number of exhibits
059 * (keyed from their names and from canonicalised forms of those names for shared results)
060 * and may be capped to a fixed ceiling below that to retain only the most popular lookups.
061 */
062 public final class SearchResultSimpleCache
063 {
064 /**Map from full exhibit name to immutable ordered list of full-name results, best match first.
065 * Thread-safe.
066 * <p>
067 * Capped size to minimise memory footprint while still giving good performance.
068 * Falling through the cache and doing the query from scratch shouldn't be hideously slow anyway.
069 * <p>
070 * Since the actual cached items are small (even compared to Map.Entry overheads),
071 * a relatively large number can be cached
072 * and it is not worth a complex high-overhead scheme, eg using SoftReferences, to manage them.
073 * <p>
074 * Limit maximum cache size in proportion to heap size; strictly positive.
075 * Aim to allow ~10000 entries per 1GB of heap space (at initialisation of this instance),
076 * with a minimum of tens/hundreds to cover popular pages, etc, at relatively low memory cost.
077 * <p>
078 * We're prepared to discard everything under acute memory pressure.
079 */
080 private final MemoryTools.CacheMiniMap<Name.ExhibitFull,List<Name.ExhibitFull>> fromFullName =
081 MemoryTools.SimpleProbabilisticCache.<Name.ExhibitFull,List<Name.ExhibitFull>>create(computeCacheMaxSize(), "fromFullName");
082
083 /**Map from condensed canonical key to immutable ordered list of full-name results, best match first.
084 * Thread-safe.
085 * As fromFullName.
086 */
087 private final MemoryTools.CacheMiniMap<Integer,List<Name.ExhibitFull>> fromCanonKey =
088 MemoryTools.SimpleProbabilisticCache.<Integer,List<Name.ExhibitFull>>create(computeCacheMaxSize(), "fromCanonKey");
089
090 /**Compute maximum cache size, partly based on heap size; strictly positive. */
091 private static int computeCacheMaxSize()
092 { return(Math.max(1<<8, (int) Math.min(1<<16, Runtime.getRuntime().totalMemory() >> 17))); }
093
094 /**Absolute maximum number of results to store/cache per exhibit.
095 * Limits storage requirements,
096 * and is as many as we should ever want to show on one catalogue page.
097 * <p>
098 * If the catalogue page logic actually uses this value as the maximum request size
099 * then we should get optimal CPU and memory use, and minimal heap churn.
100 */
101 public static final int ABS_MAX_RESULTS = WebConsts.MAX_RESULTS_PER_PAGE/2;
102
103 /**Do cached search lookup on given full exhibit name returning full exhibit name results; never null.
104 * If fewer items are requested than computed/cached,
105 * then the best (first) cached items are returned.
106 * <p>
107 * The (immutable) results are limited to at most ABS_MAX_RESULTS items.
108 * <p>
109 * Excludes the exhibit itself and variants of the exhibit
110 * (same main words stem, etc, giving same canonicalised search term)
111 * on the assumption that they may already have been shown in other ways.
112 * <p>
113 * This allows potentially-redundant requests to execute in parallel
114 * at the risk of a little wasted work
115 * so as to maximise potential concurrency.
116 * <p>
117 * We do NOT cache any results if we don't believe the index
118 * to be up-to-date (eg to avoid cacheing stale/empty values).
119 *
120 * @return private String[] of result short exhibit names, best first;
121 * possibly zero-length but never null
122 */
123 public List<Name.ExhibitFull> getRelatedExhibits(final DataSourceBean dataSource,
124 final Name.ExhibitFull exhibitFullName,
125 final int maxResults)
126 throws IOException
127 {
128 if((dataSource == null) ||
129 (exhibitFullName == null) ||
130 (maxResults < 0))
131 { throw new IllegalArgumentException(); }
132
133 // Check if we have anything cached under the full exhibit name...
134 List<Name.ExhibitFull> r = fromFullName.get(exhibitFullName);
135 if(r == null)
136 {
137 // Nothing cached under the full exhibit name,
138 // so compute our search term which is the canonical cache key.
139
140 // By default cache the results that we compute.
141 boolean cacheResults = true;
142
143 // Neither compute nor cache results for a non-extant exhibit
144 // so as to bound the cache size (and save a little time).
145 final AllExhibitImmutableData aeid = dataSource.getAllExhibitProperties(-1).aeid;
146 if(!aeid.isPresent(exhibitFullName)) { return(Collections.emptyList()); }
147
148 // The search term is all the main and attribute words,
149 // canonicalised in the standard way for all searches.
150 final Name searchTerm = computeRawSearchTermFromExhibitName(exhibitFullName);
151 final Integer canonKey = Integer.valueOf(searchTerm.hashCode()); // Relies on good hash.
152
153 // See if cached under the canonical search term.
154 r = fromCanonKey.get(canonKey);
155 // No, not cached under the canonical search term either,
156 // so we'll really have to use the main index!
157 if(r == null)
158 {
159 final SortedSet<String> attrWords = ExhibitAttrUtils.getAttrWords().getAttrWordsSortedSet();
160
161 // Set up the exhibit filter for the index bean.
162 // We arrange to reject the original exhibit,
163 // and all close variants of it
164 // (those that produce the same canonical search term).
165 // We reject multiple close variants of any other exhibit
166 // so as to give maximum variety in the results.
167 final JIndexBean.SearchFilterByName filter =
168 new JIndexBean.SearchFilterByName(){
169 /**Thread-safe set of exhibit name stems already accepted; never null. */
170 private final Set<CharSequence8Bit> alreadyAccepted =
171 Collections.synchronizedSet(new HashSet<CharSequence8Bit>(16+2*ABS_MAX_RESULTS));
172
173 /**Accept or reject based on persistable key used as document name. */
174 public final boolean accept(final String pKey)
175 {
176 // Convert persistable key (base on short exhibit name) to full name.
177 final Name.ExhibitFull fn = aeid.getFullNameFromPeristableKey(pKey);
178 if(fn == null) { return(false); }
179 final CharSequence st = (computeRawSearchTermFromExhibitName(fn));
180
181 // Reject if the canonicalised search term is the same.
182 if(TextUtils.contentEquals(st, searchTerm)) { return(false); }
183
184 // Our stem excludes attribute words here
185 // and is usable as a key given the working hashCode and equals().
186 final CharSequence8Bit stem = fn.getShortName().getMainWordsComponent(attrWords);
187 assert(stem instanceof MemoryTools.Internable) : "has to have working hashCode() and equals()";
188 // If we already accepted same stem earlier then reject this.
189 if(!alreadyAccepted.add(stem)) { return(false); }
190
191 // OK, accept this!
192 return(true);
193 }
194 };
195
196 // If we find the index is not up-to-date right now
197 // then don't cache the results after computing them.
198 if(!dataSource.byWordIndexIsAvailableAndUpToDate())
199 { cacheResults = false; }
200
201 // Now really do the lookup/search.
202 // Don't allow fallback to wide searches
203 // that may litter a page with confusing spurious links.
204 // If nothing else, such spurious links and text
205 // may confuse classification engines
206 // such as that in Google's AdSense "Mediabot".
207 r = SearchUtils.doRelatedExhibitsSearch(searchTerm,
208 dataSource,
209 ABS_MAX_RESULTS,
210 filter,
211 false,
212 true);
213
214 // Cache under the canonical term.
215 if(cacheResults) { fromCanonKey.put(canonKey, r); }
216
217 //if(IsDebug.isDebug) { System.out.println("SearchResultSimpleCache: found/cached "+(raw.length)+" result(s) for exhibit "+exhibitName+" with canonical search term: " + searchTerm); }
218 }
219
220 // Cache the results (including empty results) under the full name.
221 if(cacheResults) { fromFullName.put(exhibitFullName, r); }
222 }
223
224 // If the request was for fewer results than we happen to have cached,
225 // return a sub-list view (still immutable)...
226 if(maxResults < r.size())
227 { return(r.subList(0, maxResults)); }
228
229 // Return the entire computed/cached result.
230 return(r);
231 }
232
233 /**Compute the (immutable) raw core lookup term in the by-word index from text in the exhibit name itself; never null.
234 * This uses the main words component and attribute words, mono-cased.
235 * <p>
236 * This keeps the constituent words in the order supplied,
237 * and always retains the first word (later duplicates may be discarded for example)
238 * as earlier words are taken to be more important.
239 *
240 * @return non-null, immutable, intern()ed, core lookup term for automated searches
241 */
242 public static Name computeRawSearchTermFromExhibitName(final Name.ExhibitFull exhibitFullName)
243 {
244 final CharSequence r = DataSourceBean.canonicaliseSimpleByWordQuery(
245 exhibitFullName.getShortName().getMainWordsComponent(Collections.<String>emptySet()).toString(),
246 ExhibitName.MAX_STEM_LENGTH);
247 return(Name.create(r)); // Store as a compact immutable (intern()ed) Name.
248 }
249
250 /**Private lookup key for full searches of similar items; never null. */
251 private static final DataSourceBean.AEPLinkedKey fullSearchCacheKey = new DataSourceBean.AEPLinkedKey("fullSearchCacheKey");
252
253 /**Do cached lookup for cat page "similar items"; never null.
254 * The (immutable) result is keyed to the exhibit name and AEP.
255 */
256 public static List<Name.ExhibitFull> doCachedCatPageSimilarItems(final DataSourceBean dsb,
257 final Name.ExhibitFull exhibitName,
258 final int maxToShow)
259 {
260 // Get existing cache, or create unique new one (eg after AEP change).
261 SearchResultSimpleCache cachedResults;
262 while((cachedResults = (SearchResultSimpleCache) dsb.getAEPLinkedValue(fullSearchCacheKey)) == null)
263 { dsb.putIfAbsentAEPLinkedValue(fullSearchCacheKey, new SearchResultSimpleCache()); }
264
265 try
266 {
267 return(cachedResults.getRelatedExhibits(dsb,
268 exhibitName,
269 maxToShow));
270 }
271 catch(final Exception e)
272 {
273 e.printStackTrace(); // Whinge but absorb any error.
274 final List<Name.ExhibitFull> emptyResult = Collections.emptyList();
275 return(emptyResult);
276 }
277 }
278
279 /**Do cached search lookup as for doCachedCatPageSimilarItems() but asynchronously; never null.
280 * The request is always run, possibly in this thread blocking the caller.
281 */
282 public static Future<List<Name.ExhibitFull>> doCachedCatPageSimilarItemsFuture(
283 final DataSourceBean dsb,
284 final Name.ExhibitFull exhibitFullName,
285 final int maxResults)
286 { return(doCachedCatPageSimilarItemsFuture(dsb, exhibitFullName, maxResults, false)); }
287
288 /**Do cached search lookup as for doCachedCatPageSimilarItems() but asynchronously; never null.
289 * @param discardable if true then this call will never block but the request is discardable,
290 * ie may never even be scheduled to run or may run at low priority,
291 * or may immediately return a real result
292 * or may immediately return an empty result if there seems no prospect of getting a task queued,
293 * so code should not wait indefinitely for a task completion (eg with get()) that may never come
294 */
295 public static Future<List<Name.ExhibitFull>> doCachedCatPageSimilarItemsFuture(
296 final DataSourceBean dsb,
297 final Name.ExhibitFull exhibitFullName,
298 final int maxResults,
299 final boolean discardable)
300 {
301 // Get existing cache, is any.
302 final SearchResultSimpleCache cachedResults = (SearchResultSimpleCache) dsb.getAEPLinkedValue(fullSearchCacheKey);
303 if(null != cachedResults)
304 {
305 // Check if we have anything cached under the full exhibit name...
306 final List<Name.ExhibitFull> r = cachedResults.fromFullName.get(exhibitFullName);
307 // Have cached result, so return already-finished value...
308 if(null != r)
309 { return(ThreadUtils.makeCompletedFuture(r)); }
310 }
311
312 // If we did not find a cached value
313 // then try to start a task to compute the value asynchronously
314 // unless the request is discardable and there appears to be no chance of doing so
315 // in which case return an empty result immediately.
316 if(discardable && !ThreadUtils.couldRunLowPriorityDiscardableTask())
317 {
318 if(IsDebug.isDebug) { System.err.println("WARNING: did not try to start doCachedCatPageSimilarItems() for "+exhibitFullName); }
319 return(ThreadUtils.makeCompletedFuture(Collections.<Name.ExhibitFull>emptyList()));
320 }
321 // Try to start a task...
322 final Callable<List<Name.ExhibitFull>> callable = new Callable<List<Name.ExhibitFull>>(){
323 public final List<Name.ExhibitFull> call() throws Exception
324 { return(doCachedCatPageSimilarItems(dsb, exhibitFullName, maxResults)); }
325 };
326 return((discardable ? ThreadUtils.lowPriorityThreadPoolDiscardable : ThreadUtils.computeIntensiveThreadPool).submit(callable));
327 }
328 }