001 /*
002 Copyright (c) 1996-2012, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029
030 package org.hd.d.pg2k.webSvr.exhibit;
031
import java.util.Locale;
import java.util.regex.Pattern;

import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;

import org.hd.d.pg2k.svrCore.Rnd;
import org.hd.d.pg2k.svrCore.TextUtils;
import org.hd.d.pg2k.svrCore.props.GenProps;
import org.hd.d.pg2k.svrCore.vars.EventPeriod;
import org.hd.d.pg2k.svrCore.vars.SimpleVariablePipelineIF;
import org.hd.d.pg2k.svrCore.vars.SimpleVariableValue;
import org.hd.d.pg2k.svrCore.vars.SystemVariables;
044
045 /**Servlet-related utility functions.
046 * This mainly consists of small support routines for servlets in this package.
047 */
048 public final class ServletUtils
049 {
050 /**Prevent construction of an instance. */
051 private ServletUtils() { }
052
053
054 /**Default maximum percent of all exhibit requests that may come from one external source without us blocking them.
055 * Normally in the range [1,100].
056 * <p>
057 * A value of a few percent should allow legitimate referrals from bona fide image search engines
058 * and low volumes of traffic from small sites that we don't object to,
059 * but should veto egregious heavy misuse of bandwidth.
060 * <p>
061 * At 20050205 a threshold of about 25% would just about let in all legitimate referrals
062 * from the highest-volume search engines (images.search.yahoo.com, images.google.com)
063 * and disallow most of the unwanted hotlinking from third-party sights.
064 * This should prove itself to be reasonably conservative,
065 * and is used if no explicit GenProps values is supplied.
066 * <p>
067 * As of 20100606 the value that had been in use for some time was 5%.
068 */
069 private static final int DEFAULT_MAX_HOTLINK_REQ_PERCENT = 5;
070
071 /**Max (right-most) characters of putative hotlinker hostname retained; strictly positive.
072 * One less than a power of 2 so that adding a unique marker character to mark the truncation
073 * still makes for an efficient array size.
074 * <p>
075 * Empirically chosen based on noise captured by existing system as of 2011/09/19.
076 */
077 public static final int MAX_RHS_SANITISED_HOSTNAME_CHARS = 31;
078
079 /**Max (right-most) domain components of putative hotlinker hostname retained; strictly positive.
080 * Discard left-most, less-significant sub-domain data for prune noise.
081 * <p>
082 * If at least 4 then IPv4 literals will be preserved.
083 * <p>
084 * Empirically chosen based on noise captured by existing system as of 2011/09/19.
085 */
086 public static final int MAX_RHS_SANITISED_HOSTNAME_PARTS = 4;
087
088 /**Check if an apparent hotlinked request for an (exhibit or thumbnail) should be blocked.
089 * May record the request (even spiders / "" referrals that we will never block),
090 * and if the referrer is especially egregious will return true
091 * to suggest rejecting the request completely.
092 * A false return may still treats the request as potentially dodgy,
093 * but suggests that it be allowed to continue.
094 * <p>
095 * Absent or unparsable referrer URLs are not blocked by this mechanism
096 * (as these may be spiders or users running with "Referer" turned off for security reasons).
097 * They are dealt with in other ways.
098 * <p>
099 * Only call this where the referring URL seems dubious.
100 * <p>
101 * Algorithm for blocking requests by referrer:
102 * <ol>
103 * <li>If the host is on the "allow" list, then allow the request.
104 * <li>Else if the host is on the "block" list, then block the request.
105 * <li>If there were no hotlinks from this host yesterday,
106 * then block the request.
107 * The aim of this is to discourage people setting up hotlinks
108 * to start with from their own pages since they just won't work.
109 * <li>Else if the requests from this referrer today/yesterday have amounted to
110 * (say) 10% or more of all exhibit requests in that period, then block the request.
111 * (Note that thumbnail requests should normally be much less common than exhibit requests
112 * because they can be cached for a very long time.)
113 * </ol>
114 * <p>
115 * This may further sanitise and normalise the host name to maximise the signal-to-noise ratio,
116 * and reduce opportunities for malice.
117 * <p>
118 * Only package visible since only needed by servlets in this package.
119 *
120 * @param request the incoming request; never null
121 * @param normalisedReferringHost the normalised hotlinking host's name; never null
122 * @param vars the DataSource; never null
123 *
124 * @return true if this request should probably be blocked, false if request should be allowed
125 */
126 static boolean noteAndOrBlockHotlinker(final ServletContext context,
127 final HttpServletRequest request,
128 final String normalisedReferringHost,
129 final SimpleVariablePipelineIF vars,
130 final GenProps gp)
131 {
132 assert((context != null) &&
133 (request != null) &&
134 (normalisedReferringHost != null) &&
135 (vars != null) &&
136 (gp != null));
137
138 // Immediately mark the connection as low-priority to allow us to conserve bandwidth, etc.
139 request.setAttribute("LowPriConn", Boolean.TRUE);
140
141 //System.err.println("WARNING: seeing possible hotlink to exhibit/thumbnail from host/site: " + normalisedReferringHost);
142
143 // We don't log or block requests with no referring URL or an unparsable one...
144 // (Otherwise we'd never let spiders in, for example.)
145 if("".equals(normalisedReferringHost))
146 { return(false); }
147
148 // Further normalise/sanitise the host name if long for example.
149 final String sanitized = sanitiseReferrerHostName(normalisedReferringHost);
150
151 // Now record the dubious referring URL to keep tabs on such activity...
152 // Note that the attempt is recorded even if we reject it below.
153 try
154 {
155 // Record the referring host as an event...
156 final SimpleVariableValue svv = new SimpleVariableValue(SystemVariables.ACCESSPATTERN_EX_HOTLINK_REF_HOST,
157 sanitized);
158 vars.setVariable(svv);
159 }
160 catch(final Exception e) // Absorb any errors and continue, but whinge...
161 { e.printStackTrace(); }
162
163
164 // Decide whether this request should be rejected...
165
166 // If the referring host is on our allow list then allow it immediately.
167 // This can be used to quickly deal with well-known friends.
168 if(gp.getHotLinkAllowHosts().contains(sanitized))
169 { return(false); /* Allow. */}
170 final Pattern hotLinkAllowHostsRegex = gp.getHotLinkAllowHostsRegex();
171 if((hotLinkAllowHostsRegex != null) &&
172 hotLinkAllowHostsRegex.matcher(sanitized).matches())
173 { return(false); /* Allow. */}
174
175 // If the referring host is on disallow list then block it immediately.
176 // This can be used to quickly deal with well-known transgressors.
177 if(gp.getHotLinkDisallowHosts().contains(sanitized))
178 { return(true); /* Block. */ }
179 final Pattern hotLinkDisallowHostsRegex = gp.getHotLinkDisallowHostsRegex();
180 if((hotLinkDisallowHostsRegex != null) &&
181 hotLinkDisallowHostsRegex.matcher(sanitized).matches())
182 { return(true); /* Block. */ }
183
184 try
185 {
186 // If there were no hotlink referrals from this host
187 // in the previous period,
188 // then we guess that this might be someone
189 // designing a new page with the hotlink to us in it,
190 // so refuse the hotlink outright so as to try to discourage them!
191 // Genuine search engines, etc, that refer every day,
192 // should stay clear of this block.
193 // This should be relatively fast to check,
194 // especially as the "previous period" events should be cached locally.
195 final int prevCount =
196 vars.getEventValue(SystemVariables.ACCESSPATTERN_EX_HOTLINK_REF_HOST, EventPeriod.VLONG, false).
197 getCount(sanitized);
198 if(prevCount < 1)
199 {
200 context.log("WARNING: reject/divert of NEW hotlinker (sanitised host "+sanitized+") Referer: " + TextUtils.sanitiseForXML(request.getHeader("Referer"), 256, true));
201 return(true); /* Block. */
202 }
203
204 // If more than the allowable percentage of all exhibit downloads
205 // have come from this referring URL in the current or previous period,
206 // then disallow this request.
207
208 // We include the current period for a fast response to new loads even though partial/noisy.
209 // We include the previous period to give us some memory and a stable system-wide sample.
210
211 // Total completed exhibit downloads in current/previous periods.
212 final int totalExDownloads =
213 vars.getEventValue(SystemVariables.ACCESSPATTERN_COMPLETED_DOWNLOAD, EventPeriod.VLONG, true).
214 getTotalEventCount() +
215 vars.getEventValue(SystemVariables.ACCESSPATTERN_COMPLETED_DOWNLOAD, EventPeriod.VLONG, false).
216 getTotalEventCount();
217
218 // Total attempted exhibit and thumbnail direct referrals from same host/site as this request.
219 final int totalAttemptedReferrals =
220 vars.getEventValue(SystemVariables.ACCESSPATTERN_EX_HOTLINK_REF_HOST, EventPeriod.VLONG, true).
221 getCount(sanitized) +
222 prevCount;
223
224 // If too many referrals from this one host/site, then block this request.
225 final int rawThresholdPC = gp.getWEBSVR_EX_HOTLINK_LIMITER();
226 // A zero GenProps threshold percentage means "use the hard-coded (conservative) default".
227 final int thresholdPC = (rawThresholdPC > 0) ? rawThresholdPC : DEFAULT_MAX_HOTLINK_REQ_PERCENT;
228 final int threshold = ((totalExDownloads * thresholdPC) / 100);
229 if(totalAttemptedReferrals > threshold)
230 { return(true); /* Block the request... */ }
231 // When close to the threshold, "brown-out" and reject some connections randomly as a warning...
232 if(totalAttemptedReferrals >= ((4*threshold)/5))
233 {
234 context.log("WARNING: starting to dynamically reject/divert direct exhibit hotlinks (threshold="+threshold+", "+thresholdPC+"%) with Referer: " + TextUtils.sanitiseForXML(request.getHeader("Referer"), 256, true));
235 if(Rnd.fastRnd.nextBoolean())
236 {
237 return(true); /* Block the request... */
238 }
239 }
240 }
241 catch(final Exception e)
242 {
243 // Absorb unexpected error and let request the user's go through...
244 e.printStackTrace(); // Log the unexpected problem...
245 }
246
247 return(false); // Let the request go through...
248 }
249
250 /**Compiled pattern for splitting domain name into components; not null. */
251 private static final Pattern _dsp = Pattern.compile("[.]");
252
253 /**Further normalise/sanitise the host name if long for example; never null.
254 * A sanitised domain name, if altered, is (deliberately) not itself a valid domain name.
255 * <p>
256 * Note that normally excess length is truncated on the left,
257 * but for a literal IPv6 address (starting with '[') truncation happens on the right
258 * to preserve the most significant portions,
259 */
260 public static String sanitiseReferrerHostName(final String normalisedReferringHost)
261 {
262 // Ensure lower-case if not already so.
263 String sanitized = normalisedReferringHost.toLowerCase();
264 // Slightly special handling for raw IPv6 addresses in URLs.
265 final boolean isIPv6Literal = sanitized.startsWith("[");
266
267 // Replace all non-ASCII non-legitimate-domain-name chars with unique marker char.
268 // Allows characters for IPv6 (and IPv4) literals.
269 for(int i = sanitized.length(); --i >= 0; )
270 {
271 final char c = sanitized.charAt(i);
272 if(!((c >= 'a') && (c <= 'z')) && !((c >= '0') && (c <= '9')) &&
273 (c != '.') && (c != '-') &&
274 (c != ']') && (c != ':') && (c != '['))
275 { sanitized = sanitized.replace(c, '?'); }
276 }
277
278 // Truncate excess length (usually) from the left-hand side and prefix/suffix with unique marker char.
279 if(sanitized.length() > MAX_RHS_SANITISED_HOSTNAME_CHARS)
280 {
281 if(isIPv6Literal)
282 { sanitized = sanitized.substring(0, MAX_RHS_SANITISED_HOSTNAME_CHARS) + "+"; }
283 else
284 { sanitized = "+" + sanitized.substring(sanitized.length() - MAX_RHS_SANITISED_HOSTNAME_CHARS); }
285 }
286
287 // If the residue contains excessive levels of domain/zone then prune.
288 final String[] components = _dsp.split(sanitized, 0);
289 final int nParts = components.length;
290 if(nParts > MAX_RHS_SANITISED_HOSTNAME_PARTS)
291 {
292 sanitized = '.' + components[nParts-4] + '.' + components[nParts-3] + '.' + components[nParts-2] + '.' + components[nParts-1];
293 }
294
295 return(sanitized);
296 }
297 }