001    /*
002    Copyright (c) 1996-2012, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    
030    package org.hd.d.pg2k.webSvr.exhibit;
031    
032    import java.util.regex.Pattern;
033    
034    import javax.servlet.ServletContext;
035    import javax.servlet.http.HttpServletRequest;
036    
037    import org.hd.d.pg2k.svrCore.Rnd;
038    import org.hd.d.pg2k.svrCore.TextUtils;
039    import org.hd.d.pg2k.svrCore.props.GenProps;
040    import org.hd.d.pg2k.svrCore.vars.EventPeriod;
041    import org.hd.d.pg2k.svrCore.vars.SimpleVariablePipelineIF;
042    import org.hd.d.pg2k.svrCore.vars.SimpleVariableValue;
043    import org.hd.d.pg2k.svrCore.vars.SystemVariables;
044    
045    /**Servlet-related utility functions.
046     * This mainly consists of small support routines for servlets in this package.
047     */
048    public final class ServletUtils
049        {
050        /**Prevent construction of an instance. */
051        private ServletUtils() { }
052    
053    
054        /**Default maximum percent of all exhibit requests that may come from one external source without us blocking them.
055         * Normally in the range [1,100].
056         * <p>
057         * A value of a few percent should allow legitimate referrals from bona fide image search engines
058         * and low volumes of traffic from small sites that we don't object to,
059         * but should veto egregious heavy misuse of bandwidth.
060         * <p>
061         * At 20050205 a threshold of about 25% would just about let in all legitimate referrals
062         * from the highest-volume search engines (images.search.yahoo.com, images.google.com)
063         * and disallow most of the unwanted hotlinking from third-party sights.
064         * This should prove itself to be reasonably conservative,
065         * and is used if no explicit GenProps values is supplied.
066         * <p>
067         * As of 20100606 the value that had been in use for some time was 5%.
068         */
069        private static final int DEFAULT_MAX_HOTLINK_REQ_PERCENT = 5;
070    
071        /**Max (right-most) characters of putative hotlinker hostname retained; strictly positive.
072         * One less than a power of 2 so that adding a unique marker character to mark the truncation
073         * still makes for an efficient array size.
074         * <p>
075         * Empirically chosen based on noise captured by existing system as of 2011/09/19.
076         */
077        public static final int MAX_RHS_SANITISED_HOSTNAME_CHARS = 31;
078    
079        /**Max (right-most) domain components of putative hotlinker hostname retained; strictly positive.
080         * Discard left-most, less-significant sub-domain data for prune noise.
081         * <p>
082         * If at least 4 then IPv4 literals will be preserved.
083         * <p>
084         * Empirically chosen based on noise captured by existing system as of 2011/09/19.
085         */
086        public static final int MAX_RHS_SANITISED_HOSTNAME_PARTS = 4;
087    
088        /**Check if an apparent hotlinked request for an (exhibit or thumbnail) should be blocked.
089         * May record the request (even spiders / "" referrals that we will never block),
090         * and if the referrer is especially egregious will return true
091         * to suggest rejecting the request completely.
092         * A false return may still treats the request as potentially dodgy,
093         * but suggests that it be allowed to continue.
094         * <p>
095         * Absent or unparsable referrer URLs are not blocked by this mechanism
096         * (as these may be spiders or users running with "Referer" turned off for security reasons).
097         * They are dealt with in other ways.
098         * <p>
099         * Only call this where the referring URL seems dubious.
100         * <p>
101         * Algorithm for blocking requests by referrer:
102         * <ol>
103         * <li>If the host is on the "allow" list, then allow the request.
104         * <li>Else if the host is on the "block" list, then block the request.
105         * <li>If there were no hotlinks from this host yesterday,
106         *     then block the request.
107         *     The aim of this is to discourage people setting up hotlinks
108         *     to start with from their own pages since they just won't work.
109         * <li>Else if the requests from this referrer today/yesterday have amounted to
110         *     (say) 10% or more of all exhibit requests in that period, then block the request.
111         *     (Note that thumbnail requests should normally be much less common than exhibit requests
112         *     because they can be cached for a very long time.)
113         * </ol>
114         * <p>
115         * This may further sanitise and normalise the host name to maximise the signal-to-noise ratio,
116         * and reduce opportunities for malice.
117         * <p>
118         * Only package visible since only needed by servlets in this package.
119         *
120         * @param request  the incoming request; never null
121         * @param normalisedReferringHost  the normalised hotlinking host's name; never null
122         * @param vars  the DataSource; never null
123         *
124         * @return true  if this request should probably be blocked, false if request should be allowed
125         */
126        static boolean noteAndOrBlockHotlinker(final ServletContext context,
127                                               final HttpServletRequest request,
128                                               final String normalisedReferringHost,
129                                               final SimpleVariablePipelineIF vars,
130                                               final GenProps gp)
131            {
132            assert((context != null) &&
133                   (request != null) &&
134                   (normalisedReferringHost != null) &&
135                   (vars != null) &&
136                   (gp != null));
137    
138            // Immediately mark the connection as low-priority to allow us to conserve bandwidth, etc.
139            request.setAttribute("LowPriConn", Boolean.TRUE);
140    
141    //System.err.println("WARNING: seeing possible hotlink to exhibit/thumbnail from host/site: " + normalisedReferringHost);
142    
143            // We don't log or block requests with no referring URL or an unparsable one...
144            // (Otherwise we'd never let spiders in, for example.)
145            if("".equals(normalisedReferringHost))
146                { return(false); }
147    
148            // Further normalise/sanitise the host name if long for example.
149            final String sanitized = sanitiseReferrerHostName(normalisedReferringHost);
150    
151            // Now record the dubious referring URL to keep tabs on such activity...
152            // Note that the attempt is recorded even if we reject it below.
153            try
154                {
155                // Record the referring host as an event...
156                final SimpleVariableValue svv = new SimpleVariableValue(SystemVariables.ACCESSPATTERN_EX_HOTLINK_REF_HOST,
157                        sanitized);
158                vars.setVariable(svv);
159                }
160            catch(final Exception e) // Absorb any errors and continue, but whinge...
161                { e.printStackTrace(); }
162    
163    
164            // Decide whether this request should be rejected...
165    
166            // If the referring host is on our allow list then allow it immediately.
167            // This can be used to quickly deal with well-known friends.
168            if(gp.getHotLinkAllowHosts().contains(sanitized))
169                { return(false); /* Allow. */}
170            final Pattern hotLinkAllowHostsRegex = gp.getHotLinkAllowHostsRegex();
171            if((hotLinkAllowHostsRegex != null) &&
172               hotLinkAllowHostsRegex.matcher(sanitized).matches())
173                { return(false); /* Allow. */}
174    
175            // If the referring host is on disallow list then block it immediately.
176            // This can be used to quickly deal with well-known transgressors.
177            if(gp.getHotLinkDisallowHosts().contains(sanitized))
178                { return(true); /* Block. */ }
179            final Pattern hotLinkDisallowHostsRegex = gp.getHotLinkDisallowHostsRegex();
180            if((hotLinkDisallowHostsRegex != null) &&
181               hotLinkDisallowHostsRegex.matcher(sanitized).matches())
182                { return(true); /* Block. */ }
183    
184            try
185                {
186                // If there were no hotlink referrals from this host
187                // in the previous period,
188                // then we guess that this might be someone
189                // designing a new page with the hotlink to us in it,
190                // so refuse the hotlink outright so as to try to discourage them!
191                // Genuine search engines, etc, that refer every day,
192                // should stay clear of this block.
193                // This should be relatively fast to check,
194                // especially as the "previous period" events should be cached locally.
195                final int prevCount =
196                    vars.getEventValue(SystemVariables.ACCESSPATTERN_EX_HOTLINK_REF_HOST, EventPeriod.VLONG, false).
197                                    getCount(sanitized);
198                if(prevCount < 1)
199                    {
200                    context.log("WARNING: reject/divert of NEW hotlinker (sanitised host "+sanitized+") Referer: " + TextUtils.sanitiseForXML(request.getHeader("Referer"), 256, true));
201                    return(true); /* Block. */
202                    }
203    
204                // If more than the allowable percentage of all exhibit downloads
205                // have come from this referring URL in the current or previous period,
206                // then disallow this request.
207    
208                // We include the current period for a fast response to new loads even though partial/noisy.
209                // We include the previous period to give us some memory and a stable system-wide sample.
210    
211                // Total completed exhibit downloads in current/previous periods.
212                final int totalExDownloads =
213                        vars.getEventValue(SystemVariables.ACCESSPATTERN_COMPLETED_DOWNLOAD, EventPeriod.VLONG, true).
214                            getTotalEventCount() +
215                        vars.getEventValue(SystemVariables.ACCESSPATTERN_COMPLETED_DOWNLOAD, EventPeriod.VLONG, false).
216                            getTotalEventCount();
217    
218                // Total attempted exhibit and thumbnail direct referrals from same host/site as this request.
219                final int totalAttemptedReferrals =
220                        vars.getEventValue(SystemVariables.ACCESSPATTERN_EX_HOTLINK_REF_HOST, EventPeriod.VLONG, true).
221                            getCount(sanitized) +
222                        prevCount;
223    
224                // If too many referrals from this one host/site, then block this request.
225                final int rawThresholdPC = gp.getWEBSVR_EX_HOTLINK_LIMITER();
226                // A zero GenProps threshold percentage means "use the hard-coded (conservative) default".
227                final int thresholdPC = (rawThresholdPC > 0) ? rawThresholdPC : DEFAULT_MAX_HOTLINK_REQ_PERCENT;
228                final int threshold = ((totalExDownloads * thresholdPC) / 100);
229                if(totalAttemptedReferrals > threshold)
230                    { return(true); /* Block the request... */ }
231                // When close to the threshold, "brown-out" and reject some connections randomly as a warning...
232                if(totalAttemptedReferrals >= ((4*threshold)/5))
233                    {
234                    context.log("WARNING: starting to dynamically reject/divert direct exhibit hotlinks (threshold="+threshold+", "+thresholdPC+"%) with Referer: " + TextUtils.sanitiseForXML(request.getHeader("Referer"), 256, true));
235                    if(Rnd.fastRnd.nextBoolean())
236                        {
237                        return(true); /* Block the request... */
238                        }
239                    }
240                }
241            catch(final Exception e)
242                {
243                // Absorb unexpected error and let request the user's go through...
244                e.printStackTrace(); // Log the unexpected problem...
245                }
246    
247            return(false); // Let the request go through...
248            }
249    
250        /**Compiled pattern for splitting domain name into components; not null. */
251        private static final Pattern _dsp = Pattern.compile("[.]");
252    
253        /**Further normalise/sanitise the host name if long for example; never null.
254         * A sanitised domain name, if altered, is (deliberately) not itself a valid domain name.
255         * <p>
256         * Note that normally excess length is truncated on the left,
257         * but for a literal IPv6 address (starting with '[') truncation happens on the right
258         * to preserve the most significant portions,
259         */
260        public static String sanitiseReferrerHostName(final String normalisedReferringHost)
261            {
262            // Ensure lower-case if not already so.
263            String sanitized = normalisedReferringHost.toLowerCase();
264            // Slightly special handling for raw IPv6 addresses in URLs.
265            final boolean isIPv6Literal = sanitized.startsWith("[");
266    
267            // Replace all non-ASCII non-legitimate-domain-name chars with unique marker char.
268            // Allows characters for IPv6 (and IPv4) literals.
269            for(int i = sanitized.length(); --i >= 0; )
270                {
271                final char c = sanitized.charAt(i);
272                if(!((c >= 'a') && (c <= 'z')) && !((c >= '0') && (c <= '9')) &&
273                        (c != '.') && (c != '-') &&
274                        (c != ']') && (c != ':') && (c != '['))
275                    { sanitized = sanitized.replace(c, '?'); }
276                }
277    
278            // Truncate excess length (usually) from the left-hand side and prefix/suffix with unique marker char.
279            if(sanitized.length() > MAX_RHS_SANITISED_HOSTNAME_CHARS)
280                {
281                if(isIPv6Literal)
282                    { sanitized = sanitized.substring(0, MAX_RHS_SANITISED_HOSTNAME_CHARS) + "+"; }
283                else
284                    { sanitized = "+" + sanitized.substring(sanitized.length() - MAX_RHS_SANITISED_HOSTNAME_CHARS); }
285                }
286    
287            // If the residue contains excessive levels of domain/zone then prune.
288            final String[] components = _dsp.split(sanitized, 0);
289            final int nParts = components.length;
290            if(nParts > MAX_RHS_SANITISED_HOSTNAME_PARTS)
291                {
292                sanitized = '.' + components[nParts-4] + '.' + components[nParts-3] + '.' + components[nParts-2] + '.' + components[nParts-1];
293                }
294    
295            return(sanitized);
296            }
297        }