001    /*
002    Copyright (c) 1996-2012, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    package org.hd.d.pg2k.webSvr.util;
030    
031    import java.io.BufferedInputStream;
032    import java.io.ByteArrayOutputStream;
033    import java.io.IOException;
034    import java.io.InputStream;
035    import java.io.OutputStream;
036    import java.lang.ref.SoftReference;
037    import java.net.URLConnection;
038    import java.nio.ByteBuffer;
039    import java.util.SortedMap;
040    import java.util.zip.ZipEntry;
041    import java.util.zip.ZipInputStream;
042    
043    import javax.servlet.ServletConfig;
044    import javax.servlet.http.HttpServlet;
045    import javax.servlet.http.HttpServletRequest;
046    import javax.servlet.http.HttpServletResponse;
047    
048    import org.hd.d.pg2k.svrCore.AccessionData;
049    import org.hd.d.pg2k.svrCore.AllExhibitProperties;
050    import org.hd.d.pg2k.svrCore.CoreConsts;
051    import org.hd.d.pg2k.svrCore.ExhibitPropsLoadable;
052    import org.hd.d.pg2k.svrCore.ExhibitStaticAttr;
053    import org.hd.d.pg2k.svrCore.FileTools;
054    import org.hd.d.pg2k.svrCore.FileTools.ZE;
055    import org.hd.d.pg2k.svrCore.MemoryTools;
056    import org.hd.d.pg2k.svrCore.Name;
057    import org.hd.d.pg2k.svrCore.Tuple;
058    import org.hd.d.pg2k.svrCore.MIME.ExhibitMIME;
059    import org.hd.d.pg2k.svrCore.collections.LRUMapAutoSizeForHitRate;
060    import org.hd.d.pg2k.webSvr.exhibit.DataSourceBean;
061    import org.hd.d.pg2k.webSvr.exhibit.DataSourceBean.AEPLinkedKey;
062    
063    import ORG.hd.d.IsDebug;
064    
065    /**This is the servlet that serves the content of the (latest) Gallery javadoc bundle.
066     * Essentially this does what the default WAR container support would do
067     * if the javadoc files were simply laid out in WAR as static data,
068     * though giving us the option to tune cacheing and similar behaviour.
069     * <p>
070     * We only respond to GET (and possibly HEAD) requests.
071     */
072    public final class JavadocBundleServlet extends HttpServlet
073        {
074        /**Cache key for most frequently-fetched (small) files; never null.
075         * Linked to the AEP instance to be automatically discarded when the exhibit set changes.
076         */
077        private static final DataSourceBean.AEPLinkedKey fffKey = new AEPLinkedKey("fffKey");
078    
079        /**Maximum uncompressed size in bytes of a file to be considered for the fff cache; strictly positive. */
080        private static final int MAX_FFF_BYTES = 1 << 18; // 256kB.
081    
082        /**Maximum compressed size in bytes of a file to be considered for the fff cache; strictly positive. */
083        private static final int MAX_FFF_BYTES_COMPRESSED = 1 << 13; // 8kB.
084    
085        /**Minimum number of entries ('working set') to try to hold in FFF cache; strictly positive.
086         * Much smaller than MAX_FFF_ENTRIES, even as small as (say) 10.
087         */
088        private static final int MIN_FFF_ENTRIES = 16;
089    
090        /**Maximum number of entries to hold in FFF cache; strictly positive.
091         * Much larger than MIN_FFF_ENTRIES.
092         */
093        private static final int MAX_FFF_ENTRIES = 128;
094    
095        /**Respond to a GET or HEAD request for the content served by this servlet.
096         *
097         * @param request The servlet request we are processing
098         * @param response The servlet response we are producing
099         *
100         * @exception IOException if an input/output error occurs
101         */
102        private void doAction(final HttpServletRequest request,
103                              final HttpServletResponse response,
104                              final boolean isHEAD)
105            throws IOException //, ServletException
106            {
107            final DataSourceBean dsb = getDataSource(getServletConfig(), request);
108            final Name.ExhibitFull bundleExhibitName = getBundleExhibitName(dsb);
109            if(null == bundleExhibitName)
110                { response.setStatus(HttpServletResponse.SC_SERVICE_UNAVAILABLE); return; /* Absence should be only transient. */ }
111    
112            // Get the relative request path...
113            final String rawPathInfo = request.getPathInfo();
114    //System.err.println("*** javadoc pathInfo = "+rawPathInfo);
115            // Reject any trailing slash (eg directory) or empty/null request.
116            if((rawPathInfo == null) || rawPathInfo.endsWith("/"))
117                {
118                // However, if request is for the root itself, then redirect to the index.html page.
119                if("/".equals(rawPathInfo))
120                    { response.sendRedirect("index.html"); return; }
121                // Other unwanted random directory request...
122                response.setStatus(HttpServletResponse.SC_FORBIDDEN); return; /* Bad request... */
123                }
124            // Adjust for any leading slash.
125            final String pathInfo = ((rawPathInfo != null) && rawPathInfo.startsWith("/")) ?
126                (rawPathInfo.substring(1)) : rawPathInfo;
127    
128            // Barf with 'not found' if we can't find the file in the bundle...
129            final SortedMap<CharSequence, ZE> dir = getZIPEntryOffsets(dsb);
130            if(null == dir)
131                {
132                dsb.log("ERROR: unavailable or corrupt javadoc bundle (I/O timeout or missing ZIP dir?)");
133                response.setStatus(HttpServletResponse.SC_SERVICE_UNAVAILABLE); // Absence should be only transient.
134                return;
135                }
136            final ZE entryInfo = dir.get(pathInfo);
137            if(null == entryInfo)
138                {
139    if(IsDebug.isDebug) { dsb.log("WARNING: not found in javadoc bundle: " + pathInfo); }
140                response.setStatus(HttpServletResponse.SC_NOT_FOUND);
141                return;
142                }
143            final int eLength = entryInfo.length;
144            final int eOffset = entryInfo.offset;
145    
146            final org.hd.d.pg2k.svrCore.props.GenProps gp = dsb.getGenProps(-1);
147            final AllExhibitProperties aep = dsb.getAllExhibitProperties(-1);
148    
149            // This input stream is assumed to be a light-weight wrapper
150            // that does not hold any scarce resources such as file descriptors.
151            // NOTE: if we found the content in our LRU cache then we don't create the wrapper stream.
152            final ExhibitStaticAttr esa = aep.aeid.getStaticAttr(bundleExhibitName);
153            if(null == esa)
154                {
155                dsb.log("ERROR: missing esa for "+bundleExhibitName);
156                response.setStatus(HttpServletResponse.SC_SERVICE_UNAVAILABLE); // Absence should be only transient.
157                return;
158                }
159    
160            // Use the bundle timestamp rather than the individual entry's.
161            // For consistency we'll use getLastModified(request).
162            // Must behave appropriately if this returns -1.
163            final long timestamp = esa.timestamp;
164    
165            // Look for file in our cache in case it is a frequently-requested but small one,
166            // such as the top index.html or stylesheet.
167            // We hold a thread-safe LRU cache of (compressed) small files and their ZIP descriptors
168            // as if we had just fetched them from the ZIP stream...
169            // We always keep the data in (zlib) compressed form as a raw byte[]
170            // as that is simple and does save space overall for this application.
171            // We limit the cache side to avoid contributing to memory shortage.
172            // We hold the entire cache via a SoftReference to allow automatic discard if memory runs low.
173            //
174            // The cache is parameterised/sized to be able to hold at least all of the files
175            // downloaded by a browser on visiting the opening/index page,
176            // as of 20080609 with plenty of room to spare (ie accommodating reasonable growth).
177            //
178            // Both the cache and the reference are guaranteed non-null after the loop.
179            SoftReference<MemoryTools.CacheMiniMap<String, Tuple.Pair<ZipEntry, byte[]>>> fffCacheSR;
180            MemoryTools.CacheMiniMap<String, Tuple.Pair<ZipEntry, byte[]>> fffCache;
181            // (Re)create cache if null or SoftReference has been cleared.
182            while((null == (fffCacheSR = (SoftReference<MemoryTools.CacheMiniMap<String, Tuple.Pair<ZipEntry, byte[]>>>)dsb.getAEPLinkedValue(fffKey))) ||
183                    (null == (fffCache = fffCacheSR.get())))
184                {
185                final SoftReference<MemoryTools.CacheMiniMap<String, Tuple.Pair<ZipEntry, byte[]>>> newCacheSR =
186                    new SoftReference<MemoryTools.CacheMiniMap<String,Tuple.Pair<ZipEntry,byte[]>>>(
187                            fffCache = LRUMapAutoSizeForHitRate.<String,Tuple.Pair<ZipEntry,byte[]>>create(MIN_FFF_ENTRIES, MAX_FFF_ENTRIES, fffKey.comment));
188                // Replace atomically avoiding race problems.
189                if(null == fffCacheSR) { dsb.putIfAbsentAEPLinkedValue(fffKey, newCacheSR); }
190                else { dsb.replaceAEPLinkedValue(fffKey, fffCacheSR, newCacheSR); }
191                }
192            // See if we have something in cache for this request.
193            assert((fffCacheSR != null) && (fffCache != null));
194            final Tuple.Pair<ZipEntry,byte[]> cachedValue = fffCache.get(pathInfo);
195            final boolean haveCachedEntry = (null != cachedValue);
196    //if(haveCachedEntry) { System.out.println("FOUND IN CACHE "+pathInfo+" "+cachedValue.second.length+" bytes."); }
197    
198            final InputStream is = haveCachedEntry ? null : FileTools.wrapExhibitAsStream(dsb).getInputStream(esa);
199            // Attempt to skip/seek to the start of the entry we want.
200            // We hope that this is in fact an efficient constant-time seek.
201            if(!haveCachedEntry) { is.skip(eOffset); }
202            // Buffer input to ZIP reader to aggregate small reads
203            // (which might, if the ZIP file is uncached, go as inefficient separate HTTP requests upstream)
204            // into larger single requests that should comfortably fit in a single TCP packet,
205            // thus minimising the cost/waste of any subsequent over-length read.
206            // Being uncached is quite likely due to random-access into this ZIP archive.
207            // NOTE: if we found the content in our LRU cache then we don't create the ZIP stream.
208            final ZipInputStream zis = haveCachedEntry ? null : new ZipInputStream(new BufferedInputStream(is, 1024));
209            try
210                {
211                final ZipEntry ze = haveCachedEntry ? cachedValue.first : zis.getNextEntry();
212                if((null == ze) || !pathInfo.equals(ze.getName())) // Shouldn't happen...
213                    {
214                    getServletContext().log("ERROR: could not find ZIP entry");
215                    response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
216                    return;
217                    }
218    
219                // Very similar HTTP header set-up to exhibits,
220                // since this data comes from an exhibit.
221                // If we know the exact length then we use it for efficiency and robustness, else -1.
222                final long llength = (eLength >= 0) ? eLength : ze.getSize(); // -1 if not known...
223                assert(llength >= -1);
224                assert(llength <= Integer.MAX_VALUE);
225                final int length = (int) llength;
226    
227                // Select the content type as if the entry was an exhibit,
228                // else try a fallback.
229                String type = null;
230                try { type = ExhibitMIME.getMIMEType(pathInfo); }
231                catch(final Exception e)
232                    {
233                    // Try to guess...
234                    type = URLConnection.guessContentTypeFromName(pathInfo);
235                    }
236    
237    
238                // Set some cacheing headers.
239                // Make the cache time usually a significant multiple of
240                // the interval between rechecks of exhibit immutable data
241                // as this is expected to change relatively slowly.
242                // Increase it if we are busy/etc so as to reduce future server load.
243                // Actually extend to a reasonable fraction of the javadoc bundle age
244                // capped to the maximum allowed for static content.
245                final long cacheLifetime = WebUtils.computeCacheMaxAgeMSFromTimestamp(timestamp, getServletContext(), gp);
246    
247                // Set the HTTP/1.1 cache-control header to reflect
248                // the maximum time that any proxy/end-user should cache the exhibit.
249                response.setHeader("Cache-Control", "public,max-age="+(cacheLifetime/1000));
250    
251                // If this might be an HTML request from a search engine
252                // then we set a long expiry so that search engines will
253                // index and retain the page; of the order of weeks or even months.
254                // This *may* force HTTP/1.0-browser users to hit RELOAD
255                // more often than we'd like...
256                final boolean isFromSpider = WebUtils.requestProbablyFromSpider(request);
257                final long expiryTime = isFromSpider
258                    ? Math.max(cacheLifetime, WebConsts.SPIDER_PAGE_EXPIRY_MS)
259                    : cacheLifetime;
260                response.setDateHeader("Expires", System.currentTimeMillis() + expiryTime);
261    
262                // Create (strong) ETag header from MD5hash if available, else no ETag.
263                final ExhibitPropsLoadable epl = aep.getExhibitPropsLoadable(esa.getExhibitFullName());
264                final AccessionData ad = epl.getAccessionMetadata();
265                final String ETag = ((ad != null) && (ad.hashMD5 != null)) ?
266                    ("\"" + ad.hashMD5.toHexString() + "\"") : null;
267                response.setHeader("ETag", ETag);
268    
269                // Set Last-Modified from known-good timestamp
270                response.setDateHeader("Last-Modified", timestamp);
271    
272    
273                // Handle If-Modified-Since / If-None-Match before setting most headers
274                // other than cache-related ones for broken clients.
275                if(WebUtils.abortIfETagMatchOrNotModifiedSince(ETag, timestamp, request, response))
276                    { return; }
277    
278    
279                // Set the main type and length headers for browsers.
280                if(length != -1) { response.setContentLength(length); }
281                if(type != null) { response.setContentType(type); }
282    
283    
284                // If this is a HEAD request then return without providing the body.
285                if(isHEAD) { return; }
286    
287    
288                // Send the data downstream...
289                final OutputStream os = response.getOutputStream();
290                if(haveCachedEntry)
291                    {
292                    // Write out data from our cached entry in one chunk for efficiency.
293                    os.write(FileTools.decompressDeflatedData(cachedValue.second));
294                    }
295                else
296                    {
297                    // Play out data from the ZIP stream.
298                    // We want to capture this in our fff cache if small enough.
299    
300                    // Set true when we decide that we won't fff cache this file.
301                    boolean tooBigToCache = (length > MAX_FFF_BYTES);
302                    // Buffer for fffCache data (we stop recording and null this out when too big).
303                    ByteArrayOutputStream baos = tooBigToCache ? null : new ByteArrayOutputStream();
304    
305                    // Buffer for transferring data...
306                    final byte[] buf = new byte[CoreConsts.BULK_DATA_TRANSFER_SIZE];
307                    // Do the transfer...
308                    for( ; ; )
309                        {
310                        final int n = zis.read(buf);
311                        if(n == -1) { break; }
312                        os.write(buf, 0, n);
313    
314                        // If still not excluded from being fff cache material then capture these bytes too.
315                        // If it is currently exactly the right size then we must capture the excess
316                        // in order to know that it is too big to cache...
317                        if(!tooBigToCache && (baos.size() <= MAX_FFF_BYTES))
318                            {
319                            if(baos.size() + n <= MAX_FFF_BYTES) { baos.write(buf, 0, n); }
320                            else
321                                {
322                                // If this would make the file too big to cache,
323                                // stop collecting immediately and discard what we have already.
324                                tooBigToCache = true;
325                                baos = null;
326                                }
327                            }
328                        }
329    
330                    // Cache this entry (compressed) if it meets our criteria...
331                    if(!tooBigToCache && (baos != null))
332                        {
333                        final byte[] compressedData = FileTools.compressDeflatableData(baos.toByteArray());
334                        // The compressed form must be smaller than its (smaller) threshold to be cached.
335                        if(compressedData.length <= MAX_FFF_BYTES_COMPRESSED)
336                            {
337                            // Cache it!
338    //System.out.println("CACHED "+pathInfo+" "+baos.size()+" bytes.");
339                            ze.setSize(baos.size()); // Store this for next time around.
340                            fffCache.put(pathInfo, new Tuple.Pair<ZipEntry,byte[]>(ze, compressedData));
341                            }
342                        }
343                    }
344                }
345            finally { if(zis != null) { zis.close(); /* Free resources. */ } }
346            }
347    
348        /**Respond to a GET request for the content served by this servlet.
349         *
350         * @param request The servlet request we are processing
351         * @param response The servlet response we are producing
352         *
353         * @exception IOException if an input/output error occurs
354         */
355        @Override
356        public void doGet(final HttpServletRequest request,
357                          final HttpServletResponse response)
358            throws IOException //, ServletException
359            {
360            doAction(request, response, false);
361            }
362    
363        /**Respond to a HEAD request for the content served by this servlet.
364         *
365         * @param request The servlet request we are processing
366         * @param response The servlet response we are producing
367         *
368         * @exception IOException if an input/output error occurs
369         */
370        @Override
371        public void doHead(final HttpServletRequest request,
372                           final HttpServletResponse response)
373            throws IOException // , ServletException
374            {
375            doAction(request, response, true);
376            }
377    
378        /**Get `last-modified' time for the entire bundle.
379         * If we can't find out then we return -1L, the default value.
380         * <p>
381         * If returning a last-modified value causes difficulty
382         * (eg with Tomcat 4.0.1) then we return -1.
383         *
384         * @param request The servlet request we are processing
385         */
386        @Override
387        public final long getLastModified(final HttpServletRequest request)
388            {
389    //        if(WebConsts.AVOID_LAST_MODIFIED) { return(-1L); }
390    
391            final DataSourceBean dsb = getDataSource(getServletConfig(), request);
392            final Name.ExhibitFull bundleExhibitName = getBundleExhibitName(dsb);
393            try
394                {
395                if(null != bundleExhibitName)
396                    {
397                    final ExhibitStaticAttr esa = dsb.getStaticAttr(bundleExhibitName);
398                    return(esa.timestamp);
399                    }
400                }
401            catch(final Exception e) { /* Fall through in case of error... */ }
402    
403            return(-1); // Don't know.
404            }
405    
406        /**Get singleton (per-servlet-context) data pipeline/cache instance.
407         * The config param must not be null, but for some operations
408         * (such as calling destroy()) request can be null.
409         */
410        private static DataSourceBean getDataSource(
411                    final ServletConfig config,
412                    final HttpServletRequest request)
413            {
414            // Fetches/creates the data source...
415            final DataSourceBean dataSource =
416                DataSourceBean.getApplicationInstance(config.getServletContext());
417    
418            return(dataSource);
419            }
420    
421        /**Get full exhibit name of the latest javadoc bundle; null if none. */
422        private static final Name.ExhibitFull getBundleExhibitName(final DataSourceBean dsb)
423            { return(WebUtils.findLatestCodeBundle(dsb, WebConsts.PREFIX_JAVADOC_BUNDLE)); }
424    
425        /**Private key for cache of ZIP directory against current AEP instance; never null. */
426        private static final AEPLinkedKey dirKey = new AEPLinkedKey("dirKey");
427    
428        /**Get ZIP directory/offsets; null if none.
429         * The keys of the returned (immutable) map are valid files/entries in the ZIP.
430         * <p>
431         * Each value is the (non-negative) offset from the start of the ZIP file
432         * to the start of its entry as can be read with ZipInputStream.
433         * <p>
434         * This is cached against the AEP instance for efficiency.
435         * (We might have to go back across the network to the master to fetch this
436         * if we don't have the entire bundle cached locally, for example, which is slow.)
437         * <p>
438         * On successfully cacheing this we read the first byte of the archive
439         * to try to trigger (pre)cacheing of at least an enclosing early chunk.
440         */
441        private static SortedMap<CharSequence, ZE> getZIPEntryOffsets(final DataSourceBean dsb)
442            {
443            // Try for this from cache...
444            SortedMap<CharSequence, ZE> result = (SortedMap<CharSequence, ZE>) dsb.getAEPLinkedValue(dirKey);
445            if(result != null) { return(result); }
446    
447            final Name.ExhibitFull exhibitName = getBundleExhibitName(dsb);
448            if(exhibitName == null) { return(null); }
449            try { result = FileTools.getZIPEntriesLengthAndOffset(FileTools.wrapExhibitAsRandomAccessData(dsb, exhibitName)); }
450            catch(final Exception e) { dsb.log(e.getMessage()); return(null); }
451    
452            // Save (positive) result in cache.
453            if(result != null)
454                {
455                if(null == dsb.putIfAbsentAEPLinkedValue(dirKey, result))
456                    {
457                    // We just won the race to cache the ZIP directory
458                    // so now try to also force-load the ZIP start with a minimal initial read.
459                    try { dsb.getRawFile(ByteBuffer.allocate(1), exhibitName, 0, false); }
460                    catch(final IOException e) { dsb.log("Precache attempt on javadoc ZIP threw exception: "+e.getMessage()); }
461                    }
462                }
463    
464            return(result);
465            }
466    
467        /**Unique serialisation ID. */
468        private static final long serialVersionUID = 8287758876065863228L;
469        }