001    /*
002    Copyright (c) 1996-2012, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.webSvr.util;
031    
032    import java.io.IOException;
033    import java.util.Collections;
034    import java.util.List;
035    import java.util.regex.Pattern;
036    
037    import org.hd.d.pg2k.svrCore.AllExhibitProperties;
038    import org.hd.d.pg2k.svrCore.ExhibitName;
039    import org.hd.d.pg2k.svrCore.Name;
040    import org.hd.d.pg2k.svrCore.Name.ExhibitFull;
041    import org.hd.d.pg2k.svrCore.TextUtils;
042    import org.hd.d.pg2k.webSvr.exhibit.DataSourceBean;
043    
044    import ORG.hd.d.jIndexer.server.JIndexBean;
045    
046    /**Search related utility functions.
047     * This is for algorithms only of interest to Web apps, often JSPs.
048     * <p>
049     * One advantage of having a method here rather than in-line in a JSP
050     * is that is is pre-compiled off-line for speed and robustness.
051     */
052    public final class SearchUtils
053        {
054        /**Prevent construction of an instance. */
055        private SearchUtils() { }
056    
057        /**Compute immutable full exhibit names of "similar" items, eg in the foot of each catalogue page; never null.
058         * The search term is assumed to be some leading part of the
059         * (short) exhibit name, ie excluding the directory components.
060         * <p>
061         * This may try a narrow search, insisting that
062         * the first search word must appear in candidate result exhibit names and that
063         * preferably most search terms should be matched,
064         * optionally falling back to a wider search if the narrow one fails.
065         * <p>
066         * This does not cache its results.
067         * <p>
068         * When this returns an empty value it always returns the same instance.
069         *
070         * @param searchTerm non-null, non-empty search term(s);
071         *     earlier words are possibly more significant
072         * @param dataSource  data source; never null
073         * @param maxResults maximum number of results; never null
074         * @param filter  non-null filter of acceptable exhibits
075         * @param fallback if true and a narrow search fails to generate answers,
076         *     fall back to a wider search
077         * @param alreadyCanon if true then the searchTerm is already canonicalised
078         * @throws IOException in case of error
079         *
080         * @return in-order list of matches as full exhibit names, best first,
081         *     up to requested maximum number of results;
082         *     may be zero-length but never null
083         */
084        public static List<Name.ExhibitFull> doRelatedExhibitsSearch(final Name searchTerm,
085                                                       final DataSourceBean dataSource,
086                                                       final int maxResults,
087                                                       final JIndexBean.SearchFilterByName filter,
088                                                       final boolean fallback,
089                                                       final boolean alreadyCanon)
090            throws IOException
091            {
092            if((searchTerm == null) || (searchTerm.length() == 0) ||
093               (dataSource == null) ||
094               (maxResults < 1) ||
095               (filter == null))
096                { throw new IllegalArgumentException(); }
097    
098            // Ensure that we have the search terms canonicalised in a simple way.
099            // This means order preserved,
100            // but all converted to lower-case
101            // and all dashes converted to spaces.
102            final Name canonTerms = alreadyCanon ? searchTerm :
103                Name.create(DataSourceBean.canonicaliseSimpleByWordQuery(searchTerm, ExhibitName.MAX_STEM_LENGTH));
104    
105            // Compute the first word from the search terms.
106            final int firstSpace = TextUtils.indexOf(canonTerms, ' ');
107            final CharSequence firstWord = (firstSpace < 1) ?
108                canonTerms : canonTerms.subSequence(0, firstSpace);
109    
110            // Compile a regex for a case-insensitive whole-word match.
111            // By insisting on the explicit trailing separator after the word
112            // we implicitly exclude the author initials and extension from any match.
113            final Pattern p = Pattern.compile("\\b" + firstWord + ExhibitName.WORD_SEP, Pattern.CASE_INSENSITIVE);
114    
115            // Try a narrow-ish search,
116            // asking for most of the terms to be matched if possible,
117            // and insisting that the main (first) search word is in the result name.
118            // (We'll accept a crude case-insensitive substring match first,
119            // then an exact case-insensitive match.)
120            final AllExhibitProperties aep = dataSource.getAllExhibitProperties(-1);
121            final List<Name.ExhibitFull> resultNarrow = dataSource.findExhibitsByWord(canonTerms,
122                                          DataSourceBean.FEBY_MATCH_TYPE_MOST,
123                                          maxResults,
124                new JIndexBean.SearchFilterByName(){
125                    /**Accept or reject based on name. */
126                    public final boolean accept(final CharSequence n)
127                        {
128                        final ExhibitFull fullName = (ExhibitFull) n; // aep.aeid.getFullNameFromPersistableKey(pKey);
129                        if(null == fullName) { return(false); }
130    
131                        // Reject unless we can find entire first/main word
132                        // as an entire word within the specified name.
133                        // For speed we will allow a match against any part
134                        // except the extension and author initials
135                        // (ie including any attribute words).
136                        if(!p.matcher(fullName.getShortName().getMainWordsComponent(Collections.<String>emptySet())).find())
137                            { return(false); }
138    
139                        // Reject if supplied filter rejects.
140                        return(filter.accept(n));
141                        }
142                    });
143            if(resultNarrow.size() > 0)
144                { return(resultNarrow); }
145    
146            // Possibly allow fall back to much wider searches.
147            if(fallback)
148                {
149                // Try doing a match insisting on most terms being present.
150                final List<Name.ExhibitFull> resultWide = dataSource.findExhibitsByWord(canonTerms,
151                                              DataSourceBean.FEBY_MATCH_TYPE_MOST,
152                                              maxResults,
153                                              filter);
154                if(resultWide.size() > 0)
155                    { return(resultWide); }
156    
157                // Try doing a match on any terms.
158                final List<Name.ExhibitFull> resultWider = dataSource.findExhibitsByWord(canonTerms,
159                                              DataSourceBean.FEBY_MATCH_TYPE_ANY,
160                                              maxResults,
161                                              filter);
162                if(resultWider.size() > 0)
163                    { return(resultWider); }
164                }
165    
166            return(Collections.emptyList());
167            }
168        }