001    /*
002    Copyright (c) 1996-2011, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.webSvr.util;
031    
032    import java.io.IOException;
033    import java.util.Collections;
034    import java.util.HashSet;
035    import java.util.List;
036    import java.util.Set;
037    import java.util.SortedSet;
038    import java.util.concurrent.Callable;
039    import java.util.concurrent.Future;
040    
041    import org.hd.d.pg2k.svrCore.AllExhibitImmutableData;
042    import org.hd.d.pg2k.svrCore.ExhibitAttrUtils;
043    import org.hd.d.pg2k.svrCore.ExhibitName;
044    import org.hd.d.pg2k.svrCore.MemoryTools;
045    import org.hd.d.pg2k.svrCore.Name;
046    import org.hd.d.pg2k.svrCore.TextUtils;
047    import org.hd.d.pg2k.svrCore.TextUtils.CharSequence8Bit;
048    import org.hd.d.pg2k.svrCore.ThreadUtils;
049    import org.hd.d.pg2k.webSvr.exhibit.DataSourceBean;
050    
051    import ORG.hd.d.IsDebug;
052    import ORG.hd.d.jIndexer.server.JIndexBean;
053    
054    /**Bean for cached search results.
055     * Uses SearchUtils.doRelatedExhibitsSearch() to compute results lazily
056     * and caches the results.
057     * <p>
058     * Cache size is limited at most in proportion to the number of exhibits
059     * (keyed from their names and from canonicalised forms of those names for shared results)
060     * and may be capped to a fixed ceiling below that to retain only the most popular lookups.
061     */
062    public final class SearchResultSimpleCache
063        {
064        /**Map from full exhibit name to immutable ordered list of full-name results, best match first.
065         * Thread-safe.
066         * <p>
067         * Capped size to minimise memory footprint while still giving good performance.
068         * Falling through the cache and doing the query from scratch shouldn't be hideously slow anyway.
069         * <p>
070         * Since the actual cached items are small (even compared to Map.Entry overheads),
071         * a relatively large number can be cached
072         * and it is not worth a complex high-overhead scheme, eg using SoftReferences, to manage them.
073         * <p>
074         * Limit maximum cache size in proportion to heap size; strictly positive.
075         * Aim to allow ~10000 entries per 1GB of heap space (at initialisation of this instance),
076         * with a minimum of tens/hundreds to cover popular pages, etc, at relatively low memory cost.
077         * <p>
078         * We're prepared to discard everything under acute memory pressure.
079         */
080        private final MemoryTools.CacheMiniMap<Name.ExhibitFull,List<Name.ExhibitFull>> fromFullName =
081            MemoryTools.SimpleProbabilisticCache.<Name.ExhibitFull,List<Name.ExhibitFull>>create(computeCacheMaxSize(), "fromFullName");
082    
083        /**Map from condensed canonical key to immutable ordered list of full-name results, best match first.
084         * Thread-safe.
085         * As fromFullName.
086         */
087        private final MemoryTools.CacheMiniMap<Integer,List<Name.ExhibitFull>> fromCanonKey =
088            MemoryTools.SimpleProbabilisticCache.<Integer,List<Name.ExhibitFull>>create(computeCacheMaxSize(), "fromCanonKey");
089    
090        /**Compute maximum cache size, partly based on heap size; strictly positive. */
091        private static int computeCacheMaxSize()
092            { return(Math.max(1<<8, (int) Math.min(1<<16, Runtime.getRuntime().totalMemory() >> 17))); }
093    
094        /**Absolute maximum number of results to store/cache per exhibit.
095         * Limits storage requirements,
096         * and is as many as we should ever want to show on one catalogue page.
097         * <p>
098         * If the catalogue page logic actually uses this value as the maximum request size
099         * then we should get optimal CPU and memory use, and minimal heap churn.
100         */
101        public static final int ABS_MAX_RESULTS = WebConsts.MAX_RESULTS_PER_PAGE/2;
102    
103        /**Do cached search lookup on given full exhibit name returning full exhibit name results; never null.
104         * If fewer items are requested than computed/cached,
105         * then the best (first) cached items are returned.
106         * <p>
107         * The (immutable) results are limited to at most ABS_MAX_RESULTS items.
108         * <p>
109         * Excludes the exhibit itself and variants of the exhibit
110         * (same main words stem, etc, giving same canonicalised search term)
111         * on the assumption that they may already have been shown in other ways.
112         * <p>
113         * This allows potentially-redundant requests to execute in parallel
114         * at the risk of a little wasted work
115         * so as to maximise potential concurrency.
116         * <p>
117         * We do NOT cache any results if we don't believe the index
118         * to be up-to-date (eg to avoid cacheing stale/empty values).
119         *
120         * @return  private String[] of result short exhibit names, best first;
121         *     possibly zero-length but never null
122         */
123        public List<Name.ExhibitFull> getRelatedExhibits(final DataSourceBean dataSource,
124                                                         final Name.ExhibitFull exhibitFullName,
125                                                         final int maxResults)
126            throws IOException
127            {
128            if((dataSource == null) ||
129               (exhibitFullName == null) ||
130               (maxResults < 0))
131                { throw new IllegalArgumentException(); }
132    
133            // Check if we have anything cached under the full exhibit name...
134            List<Name.ExhibitFull> r = fromFullName.get(exhibitFullName);
135            if(r == null)
136                {
137                // Nothing cached under the full exhibit name,
138                // so compute our search term which is the canonical cache key.
139    
140                // By default cache the results that we compute.
141                boolean cacheResults = true;
142    
143                // Neither compute nor cache results for a non-extant exhibit
144                // so as to bound the cache size (and save a little time).
145                final AllExhibitImmutableData aeid = dataSource.getAllExhibitProperties(-1).aeid;
146                if(!aeid.isPresent(exhibitFullName)) { return(Collections.emptyList()); }
147    
148                // The search term is all the main and attribute words,
149                // canonicalised in the standard way for all searches.
150                final Name searchTerm = computeRawSearchTermFromExhibitName(exhibitFullName);
151                final Integer canonKey = Integer.valueOf(searchTerm.hashCode()); // Relies on good hash.
152    
153                // See if cached under the canonical search term.
154                r = fromCanonKey.get(canonKey);
155                // No, not cached under the canonical search term either,
156                // so we'll really have to use the main index!
157                if(r == null)
158                    {
159                    final SortedSet<String> attrWords = ExhibitAttrUtils.getAttrWords().getAttrWordsSortedSet();
160    
161                    // Set up the exhibit filter for the index bean.
162                    // We arrange to reject the original exhibit,
163                    // and all close variants of it
164                    // (those that produce the same canonical search term).
165                    // We reject multiple close variants of any other exhibit
166                    // so as to give maximum variety in the results.
167                    final JIndexBean.SearchFilterByName filter =
168                        new JIndexBean.SearchFilterByName(){
169                            /**Thread-safe set of exhibit name stems already accepted; never null. */
170                            private final Set<CharSequence8Bit> alreadyAccepted =
171                                Collections.synchronizedSet(new HashSet<CharSequence8Bit>(16+2*ABS_MAX_RESULTS));
172    
173                            /**Accept or reject based on persistable key used as document name. */
174                            public final boolean accept(final String pKey)
175                                {
176                                // Convert persistable key (base on short exhibit name) to full name.
177                                final Name.ExhibitFull fn = aeid.getFullNameFromPeristableKey(pKey);
178                                if(fn == null) { return(false); }
179                                final CharSequence st = (computeRawSearchTermFromExhibitName(fn));
180    
181                                // Reject if the canonicalised search term is the same.
182                                if(TextUtils.contentEquals(st, searchTerm)) { return(false); }
183    
184                                // Our stem excludes attribute words here
185                                // and is usable as a key given the working hashCode and equals().
186                                final CharSequence8Bit stem = fn.getShortName().getMainWordsComponent(attrWords);
187                                assert(stem instanceof MemoryTools.Internable) : "has to have working hashCode() and equals()";
188                                // If we already accepted same stem earlier then reject this.
189                                if(!alreadyAccepted.add(stem)) { return(false); }
190    
191                                // OK, accept this!
192                                return(true);
193                                }
194                            };
195    
196                    // If we find the index is not up-to-date right now
197                    // then don't cache the results after computing them.
198                    if(!dataSource.byWordIndexIsAvailableAndUpToDate())
199                        { cacheResults = false; }
200    
201                    // Now really do the lookup/search.
202                    // Don't allow fallback to wide searches
203                    // that may litter a page with confusing spurious links.
204                    // If nothing else, such spurious links and text
205                    // may confuse classification engines
206                    // such as that in Google's AdSense "Mediabot".
207                    r = SearchUtils.doRelatedExhibitsSearch(searchTerm,
208                                                dataSource,
209                                                ABS_MAX_RESULTS,
210                                                filter,
211                                                false,
212                                                true);
213    
214                    // Cache under the canonical term.
215                    if(cacheResults) { fromCanonKey.put(canonKey, r); }
216    
217    //if(IsDebug.isDebug) { System.out.println("SearchResultSimpleCache: found/cached "+(raw.length)+" result(s) for exhibit "+exhibitName+" with canonical search term: " + searchTerm); }
218                    }
219    
220                // Cache the results (including empty results) under the full name.
221                if(cacheResults) { fromFullName.put(exhibitFullName, r); }
222                }
223    
224            // If the request was for fewer results than we happen to have cached,
225            // return a sub-list view (still immutable)...
226            if(maxResults < r.size())
227                { return(r.subList(0, maxResults)); }
228    
229            // Return the entire computed/cached result.
230            return(r);
231            }
232    
233        /**Compute the (immutable) raw core lookup term in the by-word index from text in the exhibit name itself; never null.
234         * This uses the main words component and attribute words, mono-cased.
235         * <p>
236         * This keeps the constituent words in the order supplied,
237         * and always retains the first word (later duplicates may be discarded for example)
238         * as earlier words are taken to be more important.
239         *
240         * @return  non-null, immutable, intern()ed, core lookup term for automated searches
241         */
242        public static Name computeRawSearchTermFromExhibitName(final Name.ExhibitFull exhibitFullName)
243            {
244            final CharSequence r = DataSourceBean.canonicaliseSimpleByWordQuery(
245                            exhibitFullName.getShortName().getMainWordsComponent(Collections.<String>emptySet()).toString(),
246                            ExhibitName.MAX_STEM_LENGTH);
247            return(Name.create(r)); // Store as a compact immutable (intern()ed) Name.
248            }
249    
250        /**Private lookup key for full searches of similar items; never null. */
251        private static final DataSourceBean.AEPLinkedKey fullSearchCacheKey = new DataSourceBean.AEPLinkedKey("fullSearchCacheKey");
252    
253        /**Do cached lookup for cat page "similar items"; never null.
254         * The (immutable) result is keyed to the exhibit name and AEP.
255         */
256        public static List<Name.ExhibitFull> doCachedCatPageSimilarItems(final DataSourceBean dsb,
257                                                                         final Name.ExhibitFull exhibitName,
258                                                                         final int maxToShow)
259            {
260            // Get existing cache, or create unique new one (eg after AEP change).
261            SearchResultSimpleCache cachedResults;
262            while((cachedResults = (SearchResultSimpleCache) dsb.getAEPLinkedValue(fullSearchCacheKey)) == null)
263                { dsb.putIfAbsentAEPLinkedValue(fullSearchCacheKey, new SearchResultSimpleCache()); }
264    
265            try
266                {
267                return(cachedResults.getRelatedExhibits(dsb,
268                                                        exhibitName,
269                                                        maxToShow));
270                }
271            catch(final Exception e)
272                {
273                e.printStackTrace();  // Whinge but absorb any error.
274                final List<Name.ExhibitFull> emptyResult = Collections.emptyList();
275                return(emptyResult);
276                }
277            }
278    
279        /**Do cached search lookup as for doCachedCatPageSimilarItems() but asynchronously; never null.
280         * The request is always run, possibly in this thread blocking the caller.
281         */
282        public static Future<List<Name.ExhibitFull>> doCachedCatPageSimilarItemsFuture(
283                                               final DataSourceBean dsb,
284                                               final Name.ExhibitFull exhibitFullName,
285                                               final int maxResults)
286            { return(doCachedCatPageSimilarItemsFuture(dsb, exhibitFullName, maxResults, false)); }
287    
288        /**Do cached search lookup as for doCachedCatPageSimilarItems() but asynchronously; never null.
289         * @param discardable if true then this call will never block but the request is discardable,
290         *     ie may never even be scheduled to run or may run at low priority,
291         *     or may immediately return a real result
292         *     or may immediately return an empty result if there seems no prospect of getting a task queued,
293         *     so code should not wait indefinitely for a task completion (eg with get()) that may never come
294         */
295        public static Future<List<Name.ExhibitFull>> doCachedCatPageSimilarItemsFuture(
296                                               final DataSourceBean dsb,
297                                               final Name.ExhibitFull exhibitFullName,
298                                               final int maxResults,
299                                               final boolean discardable)
300            {
301            // Get existing cache, is any.
302            final SearchResultSimpleCache cachedResults = (SearchResultSimpleCache) dsb.getAEPLinkedValue(fullSearchCacheKey);
303            if(null != cachedResults)
304                {
305                // Check if we have anything cached under the full exhibit name...
306                final List<Name.ExhibitFull> r = cachedResults.fromFullName.get(exhibitFullName);
307                // Have cached result, so return already-finished value...
308                if(null != r)
309                    { return(ThreadUtils.makeCompletedFuture(r)); }
310                }
311    
312            // If we did not find a cached value
313            // then try to start a task to compute the value asynchronously
314            // unless the request is discardable and there appears to be no chance of doing so
315            // in which case return an empty result immediately.
316            if(discardable && !ThreadUtils.couldRunLowPriorityDiscardableTask())
317                {
318    if(IsDebug.isDebug) { System.err.println("WARNING: did not try to start doCachedCatPageSimilarItems() for "+exhibitFullName); }
319                return(ThreadUtils.makeCompletedFuture(Collections.<Name.ExhibitFull>emptyList()));
320                }
321            // Try to start a task...
322            final Callable<List<Name.ExhibitFull>> callable = new Callable<List<Name.ExhibitFull>>(){
323                public final List<Name.ExhibitFull> call() throws Exception
324                    { return(doCachedCatPageSimilarItems(dsb, exhibitFullName, maxResults)); }
325                };
326            return((discardable ? ThreadUtils.lowPriorityThreadPoolDiscardable : ThreadUtils.computeIntensiveThreadPool).submit(callable));
327            }
328        }