001 /*
002 Copyright (c) 1996-2012, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.webSvr.util;
031
032 import java.io.IOException;
033 import java.util.Collections;
034 import java.util.List;
035 import java.util.regex.Pattern;
036
037 import org.hd.d.pg2k.svrCore.AllExhibitProperties;
038 import org.hd.d.pg2k.svrCore.ExhibitName;
039 import org.hd.d.pg2k.svrCore.Name;
040 import org.hd.d.pg2k.svrCore.Name.ExhibitFull;
041 import org.hd.d.pg2k.svrCore.TextUtils;
042 import org.hd.d.pg2k.webSvr.exhibit.DataSourceBean;
043
044 import ORG.hd.d.jIndexer.server.JIndexBean;
045
046 /**Search related utility functions.
047 * This is for algorithms only of interest to Web apps, often JSPs.
048 * <p>
049 * One advantage of having a method here rather than in-line in a JSP
050 * is that is is pre-compiled off-line for speed and robustness.
051 */
052 public final class SearchUtils
053 {
054 /**Prevent construction of an instance. */
055 private SearchUtils() { }
056
057 /**Compute immutable full exhibit names of "similar" items, eg in the foot of each catalogue page; never null.
058 * The search term is assumed to be some leading part of the
059 * (short) exhibit name, ie excluding the directory components.
060 * <p>
061 * This may try a narrow search, insisting that
062 * the first search word must appear in candidate result exhibit names and that
063 * preferably most search terms should be matched,
064 * optionally falling back to a wider search if the narrow one fails.
065 * <p>
066 * This does not cache its results.
067 * <p>
068 * When this returns an empty value it always returns the same instance.
069 *
070 * @param searchTerm non-null, non-empty search term(s);
071 * earlier words are possibly more significant
072 * @param dataSource data source; never null
073 * @param maxResults maximum number of results; never null
074 * @param filter non-null filter of acceptable exhibits
075 * @param fallback if true and a narrow search fails to generate answers,
076 * fall back to a wider search
077 * @param alreadyCanon if true then the searchTerm is already canonicalised
078 * @throws IOException in case of error
079 *
080 * @return in-order list of matches as full exhibit names, best first,
081 * up to requested maximum number of results;
082 * may be zero-length but never null
083 */
084 public static List<Name.ExhibitFull> doRelatedExhibitsSearch(final Name searchTerm,
085 final DataSourceBean dataSource,
086 final int maxResults,
087 final JIndexBean.SearchFilterByName filter,
088 final boolean fallback,
089 final boolean alreadyCanon)
090 throws IOException
091 {
092 if((searchTerm == null) || (searchTerm.length() == 0) ||
093 (dataSource == null) ||
094 (maxResults < 1) ||
095 (filter == null))
096 { throw new IllegalArgumentException(); }
097
098 // Ensure that we have the search terms canonicalised in a simple way.
099 // This means order preserved,
100 // but all converted to lower-case
101 // and all dashes converted to spaces.
102 final Name canonTerms = alreadyCanon ? searchTerm :
103 Name.create(DataSourceBean.canonicaliseSimpleByWordQuery(searchTerm, ExhibitName.MAX_STEM_LENGTH));
104
105 // Compute the first word from the search terms.
106 final int firstSpace = TextUtils.indexOf(canonTerms, ' ');
107 final CharSequence firstWord = (firstSpace < 1) ?
108 canonTerms : canonTerms.subSequence(0, firstSpace);
109
110 // Compile a regex for a case-insensitive whole-word match.
111 // By insisting on the explicit trailing separator after the word
112 // we implicitly exclude the author initials and extension from any match.
113 final Pattern p = Pattern.compile("\\b" + firstWord + ExhibitName.WORD_SEP, Pattern.CASE_INSENSITIVE);
114
115 // Try a narrow-ish search,
116 // asking for most of the terms to be matched if possible,
117 // and insisting that the main (first) search word is in the result name.
118 // (We'll accept a crude case-insensitive substring match first,
119 // then an exact case-insensitive match.)
120 final AllExhibitProperties aep = dataSource.getAllExhibitProperties(-1);
121 final List<Name.ExhibitFull> resultNarrow = dataSource.findExhibitsByWord(canonTerms,
122 DataSourceBean.FEBY_MATCH_TYPE_MOST,
123 maxResults,
124 new JIndexBean.SearchFilterByName(){
125 /**Accept or reject based on name. */
126 public final boolean accept(final CharSequence n)
127 {
128 final ExhibitFull fullName = (ExhibitFull) n; // aep.aeid.getFullNameFromPersistableKey(pKey);
129 if(null == fullName) { return(false); }
130
131 // Reject unless we can find entire first/main word
132 // as an entire word within the specified name.
133 // For speed we will allow a match against any part
134 // except the extension and author initials
135 // (ie including any attribute words).
136 if(!p.matcher(fullName.getShortName().getMainWordsComponent(Collections.<String>emptySet())).find())
137 { return(false); }
138
139 // Reject if supplied filter rejects.
140 return(filter.accept(n));
141 }
142 });
143 if(resultNarrow.size() > 0)
144 { return(resultNarrow); }
145
146 // Possibly allow fall back to much wider searches.
147 if(fallback)
148 {
149 // Try doing a match insisting on most terms being present.
150 final List<Name.ExhibitFull> resultWide = dataSource.findExhibitsByWord(canonTerms,
151 DataSourceBean.FEBY_MATCH_TYPE_MOST,
152 maxResults,
153 filter);
154 if(resultWide.size() > 0)
155 { return(resultWide); }
156
157 // Try doing a match on any terms.
158 final List<Name.ExhibitFull> resultWider = dataSource.findExhibitsByWord(canonTerms,
159 DataSourceBean.FEBY_MATCH_TYPE_ANY,
160 maxResults,
161 filter);
162 if(resultWider.size() > 0)
163 { return(resultWider); }
164 }
165
166 return(Collections.emptyList());
167 }
168 }