001    /*
002    Copyright (c) 1996-2011, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    package org.hd.d.pg2k.svrCore;
030    
031    import java.io.File;
032    import java.io.FileInputStream;
033    import java.io.IOException;
034    import java.io.InputStream;
035    import java.io.InvalidObjectException;
036    import java.io.ObjectInputValidation;
037    import java.io.Serializable;
038    import java.io.StringReader;
039    import java.security.MessageDigest;
040    import java.security.NoSuchAlgorithmException;
041    
042    import javax.xml.parsers.DocumentBuilder;
043    import javax.xml.parsers.DocumentBuilderFactory;
044    import javax.xml.parsers.ParserConfigurationException;
045    
046    import org.w3c.dom.Document;
047    import org.w3c.dom.Element;
048    import org.w3c.dom.NamedNodeMap;
049    import org.w3c.dom.Node;
050    import org.xml.sax.InputSource;
051    import org.xml.sax.SAXException;
052    
053    /**Immutable, Serializable collection of all the accession data for an exhibit.
054     * This contains all of the available accession data for an exhibit,
055     * ie data collected at or after the exhibit was added to the library,
056     * such as checksums to verify that the exhibit has not become corrupted.
057     * <p>
058     * In some cases not all fields will be complete.
059     * <p>
060     * The core fields are made available as public final fields
061     * for speed of access.
062     * <p>
063     * We do not ever expect two instances for different exhibits
064     * to be equal, so the only reason to intern() the whole object
065     * is to reduce heap churn between an extant and new/updated exhibit set.
066     * We don't do this automatically during deserialisation,
067     * and leave it up to a user of this type to do so if they wish.
068     * We <em>do not</em> intern() the fields of this object individually,
069     * to help avoid pointless space and time being taken in intern().
070     */
071    public final class AccessionData implements Serializable,
072                                                ObjectInputValidation,
073                                                MemoryTools.Internable
074        {
075        /**Top-level node name for XML representation, ie tag name. */
076        public static final String NAME_TOP_NODE = "accessionData";
077    
078        /**Name of the accession date sub-node, if any. */
079        public static final String NAME_ACCESSION_DATE = "date";
080    
081        /**Name of the accession-time size sub-node, if any. */
082        public static final String NAME_ACCESSION_SIZE = "size";
083    
084        /**Name of the accession-time whole-exhibit CRC32 hash sub-node, if any. */
085        public static final String NAME_ACCESSION_CRC32 = "hash-CRC32";
086    
087        /**Name of the accession-time whole-exhibit MD5 hash sub-node, if any. */
088        public static final String NAME_ACCESSION_MD5 = "hash-MD5";
089    
090    
091        /**Block size (in bytes) used for partial hashes; strictly positive power of two.
092         * This is can be used while reading in a file in a block at a time
093         * to avoid holding a lock too long,
094         * and as the size of each block of a per-block hash
095         * such as can be used to check validity of the start of
096         * a partly-cached exhibit.
097         */
098        public static final int HASH_BLOCK_SIZE_BYTES = 1024 * 1024;
099    
100    
101        /**Nominal time/data at which exhibit was added to the Gallery; null or strictly positive.
102         * This is null if not available,
103         * ie the accession time/date was not recorded,
104         * else it is the strictly positive Java time in milliseconds
105         * (since 19700101 00:00 UTC).
106         * <p>
107         * This can the same as, or later than, the exhibit's timestamp.
108         * If earlier than the exhibit's timestamp it may mean that the exhibit
109         * has been altered in some way.
110         */
111        public final Long date;
112    
113        /**Size of exhibit file at time of accession; null or strictly positive.
114         * This is null if not available,
115         * ie the accession time/date was not recorded,
116         * else it is the strictly positive number of bytes in the file.
117         * <p>
118         * This should be identical to the current exhibit length
119         * else it may mean that the exhibit has been altered in some way.
120         */
121        public final Long size;
122    
123        /**Accession-time CRC32 whole-exhibit hash or null if not available.
124         * We use CRC32 since it is commonly used elsewhere as a file checksum,
125         * for example in ZIP files.
126         */
127        public final Integer hashCRC32;
128    
129        /**Accession-time MD5 whole-exhibit hash or null if not available.
130         * If non-null the array is always exactly 16 bytes long.
131         * <p>
132         * We use MD5 since it is commonly used elsewhere as a file checksum,
133         * and will in practice detect any conceivable corruption in exhibits
134         * especially given that we separately record the length.
135         */
136        public final ROByteArray hashMD5;
137    
138    
139        /**If true, some accession data is missing (eg null). */
140        public final boolean isIncomplete()
141            {
142            // Check all fields are non-null...
143            if(date == null) { return(true); }
144            if(size == null) { return(true); }
145            if(hashCRC32 == null) { return(true); }
146            if(hashMD5 == null) { return(true); }
147    
148            return(false); // Everything is present that should be...
149            }
150    
151        /**Get exhibit accession metadata as DOM, never null.
152         * This data includes such items as:
153         * <ul>
154         * <li>When the exhibit was loaded into to the Gallery.
155         * <li>Its checksums, timestamp, length, etc when initially loaded into the Gallery.
156         * </ul>
157         * <p>
158         * All this data is also accessible by (probably cheaper)
159         * direct accessor methods or fields if required.
160         */
161        public synchronized Node getAsDOM()
162            {
163            final DocumentBuilder db;
164            try { db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); }
165            catch(final ParserConfigurationException e)
166                {
167                e.printStackTrace();
168                throw new Error("Cannot create DOM form of accession data", e);
169                }
170            final Document doc = db.newDocument();
171    
172            // Create a provisional root node,
173            // but only assign it to the result (ie to get a non-null result)
174            // when we know that there is some real data in there.
175            final Element root = doc.createElement(NAME_TOP_NODE);
176            doc.appendChild(root);
177    
178            if(date != null)
179                {
180                final Element e = doc.createElement(NAME_ACCESSION_DATE);
181                e.setAttribute("value", date.toString());
182                root.appendChild(e);
183                }
184    
185            if(size != null)
186                {
187                final Element e = doc.createElement(NAME_ACCESSION_SIZE);
188                e.setAttribute("value", size.toString());
189                root.appendChild(e);
190                }
191    
192            if(hashCRC32 != null)
193                {
194                final Element e = doc.createElement(NAME_ACCESSION_CRC32);
195                e.setAttribute("value", Long.toHexString(hashCRC32.intValue() & 0xffffffffL));
196                root.appendChild(e);
197                }
198    
199            if(hashMD5 != null)
200                {
201                final Element e = doc.createElement(NAME_ACCESSION_MD5);
202                e.setAttribute("value", hashMD5.toHexString());
203                root.appendChild(e);
204                }
205    
206            // Insert other elements here...
207    
208            db.setErrorHandler(null); // Possible work-round for a memory leak.
209    
210            return(root);
211            }
212    
213    
214        /**The hashCode is VM-independent.
215         * The size and CRC32 fields are used in the computation.
216         */
217        @Override
218            public final int hashCode()
219            {
220            int result = 539235362;
221            if(size != null) { result ^= (int) size.longValue(); }
222            if(hashCRC32 != null) { result ^= hashCRC32.intValue(); }
223            return(result);
224            }
225    
226        /**All fields are compared. */
227        @Override
228            public final boolean equals(final Object o)
229            {
230            if(this == o) { return(true); }
231            if(!(o instanceof AccessionData)) { return(false); }
232            final AccessionData other = (AccessionData) o;
233    
234            if(date == null) { if(other.date != null) { return(false); } }
235            else if(!date.equals(other.date)) { return(false); }
236    
237            if(size == null) { if(other.size != null) { return(false); } }
238            else if(!size.equals(other.size)) { return(false); }
239    
240            if(hashCRC32 == null) { if(other.hashCRC32 != null) { return(false); } }
241            else if(!hashCRC32.equals(other.hashCRC32)) { return(false); }
242    
243            // Quite a lot of data to compare, so may be slow.
244            if(hashMD5 == null) { if(other.hashMD5 != null) { return(false); } }
245            else if(!hashMD5.equals(other.hashMD5)) { return(false); }
246    
247            return(true); // Equal!
248            }
249    
250    
251    
252    
253        /**Construct new immutable instance with no data.
254         * Is private to help with instance control.
255         */
256        private AccessionData()
257            { this(null, null, null, null); }
258    
259        /**Construct new immutable instance with the given data.
260         * @throws IllegalArgumentException  in the case of bad data
261         */
262        public AccessionData(final Long accessionDate,
263                             final Long accessionSize,
264                             final Integer accessionCRC32,
265                             final ROByteArray accessionMD5)
266            throws IllegalArgumentException
267            {
268            date = accessionDate;
269            size = accessionSize;
270            hashCRC32 = accessionCRC32;
271            hashMD5 = accessionMD5;
272    
273            // Verify what we've been given.
274            try { validateObject(); }
275            catch(final InvalidObjectException e)
276                { throw new IllegalArgumentException(e); }
277            }
278    
279        /**Public empty instance. */
280        public static final AccessionData EMPTY = new AccessionData();
281    
282        /**Parse from XML format UNICODE text.
283         * We will attempt to be generous in our parsing,
284         * but must be able to extract the top-level node
285         * and at least one sub-ordinate node
286         * else we will assume that we have been given a bogus file.
287         *
288         * @param xml  must be correct XML format and start with '&lt;',
289         *     not null, not zero-length
290         *
291         * @throws IOException  in case of difficulty parsing the text
292         */
293        public static AccessionData parseFromXML(final String xml)
294            throws IOException
295            {
296            if(xml == null) { throw new IllegalArgumentException(); }
297    
298            if(!xml.startsWith("<"))
299                { throw new IOException("text must start with '<'"); }
300    
301            try
302                {
303                final DocumentBuilder builder = getFactoryPfx().newDocumentBuilder();
304    
305                // Parse the XML input String...
306                final Document document = builder.parse(new InputSource(new StringReader(xml)));
307                final Node root = document.getFirstChild();
308    
309                if(!NAME_TOP_NODE.equals(root.getNodeName()))
310                    { throw new IOException("invalid top-level node: expected " + NAME_TOP_NODE); }
311    
312                // Prepare to load the individual fields.
313                Long date = null;
314                Long size = null;
315                Integer hashCRC32 = null;
316                ROByteArray hashMD5 = null;
317    
318                for(Node child = root.getFirstChild(); child != null; child = child.getNextSibling())
319                    {
320                    // Extract the "value" attribute if present.
321                    String value = null;
322                    final NamedNodeMap attributes = child.getAttributes();
323                    if(attributes != null)
324                        {
325                        final Node namedItem = attributes.getNamedItem("value");
326                        if(namedItem != null)
327                            { value = namedItem.getNodeValue(); }
328                        }
329    
330                    final String name = child.getNodeName();
331                    if(NAME_ACCESSION_DATE.equals(name))
332                        { if(value != null) { date = new Long(value); } }
333                    else if(NAME_ACCESSION_SIZE.equals(name))
334                        { if(value != null) { size = new Long(value); } }
335                    else if(NAME_ACCESSION_CRC32.equals(name))
336                        { if(value != null) { hashCRC32 = new Integer((int) Long.parseLong(value, 16)); } }
337                    else if(NAME_ACCESSION_MD5.equals(name))
338                        { if(value != null) { hashMD5 = ROByteArray.fromHexString(value); } }
339                    else
340                        { throw new IOException("unexpected node: " + name); }
341                    }
342    
343                final AccessionData result = new AccessionData(date, size, hashCRC32, hashMD5);
344                if(result.equals(EMPTY)) { return(EMPTY); }
345                return(result);
346                }
347            catch(final ParserConfigurationException e)
348                {
349                throw new Error("unable to configure XML parser", e);
350                }
351            catch(final SAXException e)
352                {
353                throw new IOException("problem parsing XML: " + e.getMessage());
354                }
355            }
356    
357        /**Construct accession data purely from the exhibit file.
358         * This does not look at any extant or old-style accession data.
359         * <p>
360         * This does not check that the exhibit is valid except that it is
361         * a plain file and not zero-length.
362         */
363        public static AccessionData fromExhibitFile(final File f)
364            throws IOException
365            {
366            if(f == null) { throw new IllegalArgumentException(); }
367    
368            final long size = f.length();
369            if(!f.exists() || !f.isFile() || !f.canRead() || (size < 1))
370                { throw new IOException("file not readable or is zero-size"); }
371    
372            final long date = f.lastModified();
373            if(date <= 0)
374                { throw new IOException("invalid timestamp on file"); }
375    
376            final InputStream is = new FileInputStream(f);
377            try
378                {
379                final Tuple.Pair<Integer,ROByteArray> hashes =
380                    computeFullFileHashes(is);
381    
382                return(new AccessionData(new Long(date),
383                                         Long.valueOf(size), // Optimisation: will get some small files.
384                                         hashes.first,
385                                         hashes.second));
386                }
387            finally { is.close(); }
388            }
389    
390        /**Compute the full-file hashes over an exhibit; never null and neither element of the Pair null.
391         * This reads the entire InputStream and computes all the full-stream hashes
392         * at once for efficiency.
393         * <p>
394         * This does not close the InputStream.
395         * <p>
396         * The InputStream need not be buffered since we read in large efficient
397         * blocks anyway if possible.
398         *
399         * @return the CRC32 checksum as an Integer in the first element,
400         *         and the MD5 hash as an ROByteArray in the second element
401         */
402        public static Tuple.Pair<Integer,ROByteArray> computeFullFileHashes(final InputStream is)
403            throws IOException
404            {
405            final java.util.zip.Checksum hCRC32 = new java.util.zip.CRC32();
406            final MessageDigest hMD5;
407            try { hMD5 = MessageDigest.getInstance(CoreConsts.HASH_MD5); }
408            catch(final NoSuchAlgorithmException e) // Should never happen...
409                { throw new Error("could not find "+CoreConsts.HASH_MD5+" digester!"); }
410    
411            // Use a fairly big buffer for read efficiency.
412            final byte buf[] = new byte[1 << 16];
413            int n;
414            while((n = is.read(buf)) > 0)
415                {
416                // Update the hashes with the next block of data.
417                hCRC32.update(buf, 0, n);
418                hMD5.update(buf, 0, n);
419                }
420    
421            assert(is.read() == -1); // Should be at EOF...
422    
423            // Return the hashes.
424            return(new Tuple.Pair<Integer,ROByteArray>(
425                new Integer((int) hCRC32.getValue()),
426                new ROByteArray(hMD5.digest())
427                ));
428            }
429    
430    
431    
432        /**Unique Serialisation class ID generated by http://random&#46;hd&#46;org/. */
433        private static final long serialVersionUID = -2699150966581906068L;
434    
435        /**Deserialise: use constructor for validation, defensive copying, etc. */
436        protected Object readResolve()
437            // throws ObjectStreamException
438            {
439            // Eliminate duplicates of the empty instance.
440            if(equals(EMPTY)) { return(EMPTY); }
441    
442            // Construct new instance of object in normal defensive way.
443            // This also ensures that we get a chance to intern() stuff, etc.
444            // However, we do not ever expect two instances for different exhibits
445            // to be equal, so the only reason to intern() the whole object
446            // is to reduce heap churn between an old and new/updated exhibit set.
447            return(new AccessionData(date, size, hashCRC32, hashMD5));
448            }
449    
450    
451        /**Validate fields/state.
452         * Called in the constructor and possibly after de-serialising.
453         * <p>
454         * Barf if something bad is found.
455         * (Maybe allow some extra info in debug version.)
456         */
457        public void validateObject()
458            throws InvalidObjectException
459            {
460            if((date != null) && !GenUtils.isValidGalleryTimestamp(date.longValue()))
461                { throw new InvalidObjectException("bad object: invalid date/timestamp "+date+ " ("+(new java.util.Date(date.longValue()))+")"); }
462            if((size != null) && (size.longValue() <= 0))
463                { throw new InvalidObjectException("bad object: non-positive size"); }
464            if((hashMD5 != null) && (hashMD5.length() != 16))
465                { throw new InvalidObjectException("bad object: incorrect size hashMD5"); }
466            }
467    
468        /**Returns parser factory instance, correctly configured.
469         * Due to 'leak' concerns this is *not* statically cached.
470         */
471        private static DocumentBuilderFactory getFactoryPfx()
472            {
473            final DocumentBuilderFactory factory_pFX = DocumentBuilderFactory.newInstance();
474    
475            // Configure factory_pFX.
476            factory_pFX.setValidating(false); // No validating (yet)...
477            factory_pFX.setNamespaceAware(false); // No namespaces (yet)...
478            factory_pFX.setIgnoringElementContentWhitespace(true); // Trim unnecessary whitespace.
479    
480            return(factory_pFX);
481            }
482        }