001 /*
002 Copyright (c) 1996-2011, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029 package org.hd.d.pg2k.svrCore;
030
031 import java.io.File;
032 import java.io.FileInputStream;
033 import java.io.IOException;
034 import java.io.InputStream;
035 import java.io.InvalidObjectException;
036 import java.io.ObjectInputValidation;
037 import java.io.Serializable;
038 import java.io.StringReader;
039 import java.security.MessageDigest;
040 import java.security.NoSuchAlgorithmException;
041
042 import javax.xml.parsers.DocumentBuilder;
043 import javax.xml.parsers.DocumentBuilderFactory;
044 import javax.xml.parsers.ParserConfigurationException;
045
046 import org.w3c.dom.Document;
047 import org.w3c.dom.Element;
048 import org.w3c.dom.NamedNodeMap;
049 import org.w3c.dom.Node;
050 import org.xml.sax.InputSource;
051 import org.xml.sax.SAXException;
052
053 /**Immutable, Serializable collection of all the accession data for an exhibit.
054 * This contains all of the available accession data for an exhibit,
055 * ie data collected at or after the exhibit was added to the library,
056 * such as checksums to verify that the exhibit has not become corrupted.
057 * <p>
058 * In some cases not all fields will be complete.
059 * <p>
060 * The core fields are made available as public final fields
061 * for speed of access.
062 * <p>
063 * We do not ever expect two instances for different exhibits
064 * to be equal, so the only reason to intern() the whole object
065 * is to reduce heap churn between an extant and new/updated exhibit set.
066 * We don't do this automatically during deserialisation,
067 * and leave it up to a user of this type to do so if they wish.
068 * We <em>do not</em> intern() the fields of this object individually,
069 * to help avoid pointless space and time being taken in intern().
070 */
071 public final class AccessionData implements Serializable,
072 ObjectInputValidation,
073 MemoryTools.Internable
074 {
075 /**Top-level node name for XML representation, ie tag name. */
076 public static final String NAME_TOP_NODE = "accessionData";
077
078 /**Name of the accession date sub-node, if any. */
079 public static final String NAME_ACCESSION_DATE = "date";
080
081 /**Name of the accession-time size sub-node, if any. */
082 public static final String NAME_ACCESSION_SIZE = "size";
083
084 /**Name of the accession-time whole-exhibit CRC32 hash sub-node, if any. */
085 public static final String NAME_ACCESSION_CRC32 = "hash-CRC32";
086
087 /**Name of the accession-time whole-exhibit MD5 hash sub-node, if any. */
088 public static final String NAME_ACCESSION_MD5 = "hash-MD5";
089
090
091 /**Block size (in bytes) used for partial hashes; strictly positive power of two.
092 * This is can be used while reading in a file in a block at a time
093 * to avoid holding a lock too long,
094 * and as the size of each block of a per-block hash
095 * such as can be used to check validity of the start of
096 * a partly-cached exhibit.
097 */
098 public static final int HASH_BLOCK_SIZE_BYTES = 1024 * 1024;
099
100
101 /**Nominal time/data at which exhibit was added to the Gallery; null or strictly positive.
102 * This is null if not available,
103 * ie the accession time/date was not recorded,
104 * else it is the strictly positive Java time in milliseconds
105 * (since 19700101 00:00 UTC).
106 * <p>
107 * This can the same as, or later than, the exhibit's timestamp.
108 * If earlier than the exhibit's timestamp it may mean that the exhibit
109 * has been altered in some way.
110 */
111 public final Long date;
112
113 /**Size of exhibit file at time of accession; null or strictly positive.
114 * This is null if not available,
115 * ie the accession time/date was not recorded,
116 * else it is the strictly positive number of bytes in the file.
117 * <p>
118 * This should be identical to the current exhibit length
119 * else it may mean that the exhibit has been altered in some way.
120 */
121 public final Long size;
122
123 /**Accession-time CRC32 whole-exhibit hash or null if not available.
124 * We use CRC32 since it is commonly used elsewhere as a file checksum,
125 * for example in ZIP files.
126 */
127 public final Integer hashCRC32;
128
129 /**Accession-time MD5 whole-exhibit hash or null if not available.
130 * If non-null the array is always exactly 16 bytes long.
131 * <p>
132 * We use MD5 since it is commonly used elsewhere as a file checksum,
133 * and will in practice detect any conceivable corruption in exhibits
134 * especially given that we separately record the length.
135 */
136 public final ROByteArray hashMD5;
137
138
139 /**If true, some accession data is missing (eg null). */
140 public final boolean isIncomplete()
141 {
142 // Check all fields are non-null...
143 if(date == null) { return(true); }
144 if(size == null) { return(true); }
145 if(hashCRC32 == null) { return(true); }
146 if(hashMD5 == null) { return(true); }
147
148 return(false); // Everything is present that should be...
149 }
150
151 /**Get exhibit accession metadata as DOM, never null.
152 * This data includes such items as:
153 * <ul>
154 * <li>When the exhibit was loaded into to the Gallery.
155 * <li>Its checksums, timestamp, length, etc when initially loaded into the Gallery.
156 * </ul>
157 * <p>
158 * All this data is also accessible by (probably cheaper)
159 * direct accessor methods or fields if required.
160 */
161 public synchronized Node getAsDOM()
162 {
163 final DocumentBuilder db;
164 try { db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); }
165 catch(final ParserConfigurationException e)
166 {
167 e.printStackTrace();
168 throw new Error("Cannot create DOM form of accession data", e);
169 }
170 final Document doc = db.newDocument();
171
172 // Create a provisional root node,
173 // but only assign it to the result (ie to get a non-null result)
174 // when we know that there is some real data in there.
175 final Element root = doc.createElement(NAME_TOP_NODE);
176 doc.appendChild(root);
177
178 if(date != null)
179 {
180 final Element e = doc.createElement(NAME_ACCESSION_DATE);
181 e.setAttribute("value", date.toString());
182 root.appendChild(e);
183 }
184
185 if(size != null)
186 {
187 final Element e = doc.createElement(NAME_ACCESSION_SIZE);
188 e.setAttribute("value", size.toString());
189 root.appendChild(e);
190 }
191
192 if(hashCRC32 != null)
193 {
194 final Element e = doc.createElement(NAME_ACCESSION_CRC32);
195 e.setAttribute("value", Long.toHexString(hashCRC32.intValue() & 0xffffffffL));
196 root.appendChild(e);
197 }
198
199 if(hashMD5 != null)
200 {
201 final Element e = doc.createElement(NAME_ACCESSION_MD5);
202 e.setAttribute("value", hashMD5.toHexString());
203 root.appendChild(e);
204 }
205
206 // Insert other elements here...
207
208 db.setErrorHandler(null); // Possible work-round for a memory leak.
209
210 return(root);
211 }
212
213
214 /**The hashCode is VM-independent.
215 * The size and CRC32 fields are used in the computation.
216 */
217 @Override
218 public final int hashCode()
219 {
220 int result = 539235362;
221 if(size != null) { result ^= (int) size.longValue(); }
222 if(hashCRC32 != null) { result ^= hashCRC32.intValue(); }
223 return(result);
224 }
225
226 /**All fields are compared. */
227 @Override
228 public final boolean equals(final Object o)
229 {
230 if(this == o) { return(true); }
231 if(!(o instanceof AccessionData)) { return(false); }
232 final AccessionData other = (AccessionData) o;
233
234 if(date == null) { if(other.date != null) { return(false); } }
235 else if(!date.equals(other.date)) { return(false); }
236
237 if(size == null) { if(other.size != null) { return(false); } }
238 else if(!size.equals(other.size)) { return(false); }
239
240 if(hashCRC32 == null) { if(other.hashCRC32 != null) { return(false); } }
241 else if(!hashCRC32.equals(other.hashCRC32)) { return(false); }
242
243 // Quite a lot of data to compare, so may be slow.
244 if(hashMD5 == null) { if(other.hashMD5 != null) { return(false); } }
245 else if(!hashMD5.equals(other.hashMD5)) { return(false); }
246
247 return(true); // Equal!
248 }
249
250
251
252
253 /**Construct new immutable instance with no data.
254 * Is private to help with instance control.
255 */
256 private AccessionData()
257 { this(null, null, null, null); }
258
259 /**Construct new immutable instance with the given data.
260 * @throws IllegalArgumentException in the case of bad data
261 */
262 public AccessionData(final Long accessionDate,
263 final Long accessionSize,
264 final Integer accessionCRC32,
265 final ROByteArray accessionMD5)
266 throws IllegalArgumentException
267 {
268 date = accessionDate;
269 size = accessionSize;
270 hashCRC32 = accessionCRC32;
271 hashMD5 = accessionMD5;
272
273 // Verify what we've been given.
274 try { validateObject(); }
275 catch(final InvalidObjectException e)
276 { throw new IllegalArgumentException(e); }
277 }
278
279 /**Public empty instance. */
280 public static final AccessionData EMPTY = new AccessionData();
281
282 /**Parse from XML format UNICODE text.
283 * We will attempt to be generous in our parsing,
284 * but must be able to extract the top-level node
285 * and at least one sub-ordinate node
286 * else we will assume that we have been given a bogus file.
287 *
288 * @param xml must be correct XML format and start with '<',
289 * not null, not zero-length
290 *
291 * @throws IOException in case of difficulty parsing the text
292 */
293 public static AccessionData parseFromXML(final String xml)
294 throws IOException
295 {
296 if(xml == null) { throw new IllegalArgumentException(); }
297
298 if(!xml.startsWith("<"))
299 { throw new IOException("text must start with '<'"); }
300
301 try
302 {
303 final DocumentBuilder builder = getFactoryPfx().newDocumentBuilder();
304
305 // Parse the XML input String...
306 final Document document = builder.parse(new InputSource(new StringReader(xml)));
307 final Node root = document.getFirstChild();
308
309 if(!NAME_TOP_NODE.equals(root.getNodeName()))
310 { throw new IOException("invalid top-level node: expected " + NAME_TOP_NODE); }
311
312 // Prepare to load the individual fields.
313 Long date = null;
314 Long size = null;
315 Integer hashCRC32 = null;
316 ROByteArray hashMD5 = null;
317
318 for(Node child = root.getFirstChild(); child != null; child = child.getNextSibling())
319 {
320 // Extract the "value" attribute if present.
321 String value = null;
322 final NamedNodeMap attributes = child.getAttributes();
323 if(attributes != null)
324 {
325 final Node namedItem = attributes.getNamedItem("value");
326 if(namedItem != null)
327 { value = namedItem.getNodeValue(); }
328 }
329
330 final String name = child.getNodeName();
331 if(NAME_ACCESSION_DATE.equals(name))
332 { if(value != null) { date = new Long(value); } }
333 else if(NAME_ACCESSION_SIZE.equals(name))
334 { if(value != null) { size = new Long(value); } }
335 else if(NAME_ACCESSION_CRC32.equals(name))
336 { if(value != null) { hashCRC32 = new Integer((int) Long.parseLong(value, 16)); } }
337 else if(NAME_ACCESSION_MD5.equals(name))
338 { if(value != null) { hashMD5 = ROByteArray.fromHexString(value); } }
339 else
340 { throw new IOException("unexpected node: " + name); }
341 }
342
343 final AccessionData result = new AccessionData(date, size, hashCRC32, hashMD5);
344 if(result.equals(EMPTY)) { return(EMPTY); }
345 return(result);
346 }
347 catch(final ParserConfigurationException e)
348 {
349 throw new Error("unable to configure XML parser", e);
350 }
351 catch(final SAXException e)
352 {
353 throw new IOException("problem parsing XML: " + e.getMessage());
354 }
355 }
356
357 /**Construct accession data purely from the exhibit file.
358 * This does not look at any extant or old-style accession data.
359 * <p>
360 * This does not check that the exhibit is valid except that it is
361 * a plain file and not zero-length.
362 */
363 public static AccessionData fromExhibitFile(final File f)
364 throws IOException
365 {
366 if(f == null) { throw new IllegalArgumentException(); }
367
368 final long size = f.length();
369 if(!f.exists() || !f.isFile() || !f.canRead() || (size < 1))
370 { throw new IOException("file not readable or is zero-size"); }
371
372 final long date = f.lastModified();
373 if(date <= 0)
374 { throw new IOException("invalid timestamp on file"); }
375
376 final InputStream is = new FileInputStream(f);
377 try
378 {
379 final Tuple.Pair<Integer,ROByteArray> hashes =
380 computeFullFileHashes(is);
381
382 return(new AccessionData(new Long(date),
383 Long.valueOf(size), // Optimisation: will get some small files.
384 hashes.first,
385 hashes.second));
386 }
387 finally { is.close(); }
388 }
389
390 /**Compute the full-file hashes over an exhibit; never null and neither element of the Pair null.
391 * This reads the entire InputStream and computes all the full-stream hashes
392 * at once for efficiency.
393 * <p>
394 * This does not close the InputStream.
395 * <p>
396 * The InputStream need not be buffered since we read in large efficient
397 * blocks anyway if possible.
398 *
399 * @return the CRC32 checksum as an Integer in the first element,
400 * and the MD5 hash as an ROByteArray in the second element
401 */
402 public static Tuple.Pair<Integer,ROByteArray> computeFullFileHashes(final InputStream is)
403 throws IOException
404 {
405 final java.util.zip.Checksum hCRC32 = new java.util.zip.CRC32();
406 final MessageDigest hMD5;
407 try { hMD5 = MessageDigest.getInstance(CoreConsts.HASH_MD5); }
408 catch(final NoSuchAlgorithmException e) // Should never happen...
409 { throw new Error("could not find "+CoreConsts.HASH_MD5+" digester!"); }
410
411 // Use a fairly big buffer for read efficiency.
412 final byte buf[] = new byte[1 << 16];
413 int n;
414 while((n = is.read(buf)) > 0)
415 {
416 // Update the hashes with the next block of data.
417 hCRC32.update(buf, 0, n);
418 hMD5.update(buf, 0, n);
419 }
420
421 assert(is.read() == -1); // Should be at EOF...
422
423 // Return the hashes.
424 return(new Tuple.Pair<Integer,ROByteArray>(
425 new Integer((int) hCRC32.getValue()),
426 new ROByteArray(hMD5.digest())
427 ));
428 }
429
430
431
432 /**Unique Serialisation class ID generated by http://random.hd.org/. */
433 private static final long serialVersionUID = -2699150966581906068L;
434
435 /**Deserialise: use constructor for validation, defensive copying, etc. */
436 protected Object readResolve()
437 // throws ObjectStreamException
438 {
439 // Eliminate duplicates of the empty instance.
440 if(equals(EMPTY)) { return(EMPTY); }
441
442 // Construct new instance of object in normal defensive way.
443 // This also ensures that we get a chance to intern() stuff, etc.
444 // However, we do not ever expect two instances for different exhibits
445 // to be equal, so the only reason to intern() the whole object
446 // is to reduce heap churn between an old and new/updated exhibit set.
447 return(new AccessionData(date, size, hashCRC32, hashMD5));
448 }
449
450
451 /**Validate fields/state.
452 * Called in the constructor and possibly after de-serialising.
453 * <p>
454 * Barf if something bad is found.
455 * (Maybe allow some extra info in debug version.)
456 */
457 public void validateObject()
458 throws InvalidObjectException
459 {
460 if((date != null) && !GenUtils.isValidGalleryTimestamp(date.longValue()))
461 { throw new InvalidObjectException("bad object: invalid date/timestamp "+date+ " ("+(new java.util.Date(date.longValue()))+")"); }
462 if((size != null) && (size.longValue() <= 0))
463 { throw new InvalidObjectException("bad object: non-positive size"); }
464 if((hashMD5 != null) && (hashMD5.length() != 16))
465 { throw new InvalidObjectException("bad object: incorrect size hashMD5"); }
466 }
467
468 /**Returns parser factory instance, correctly configured.
469 * Due to 'leak' concerns this is *not* statically cached.
470 */
471 private static DocumentBuilderFactory getFactoryPfx()
472 {
473 final DocumentBuilderFactory factory_pFX = DocumentBuilderFactory.newInstance();
474
475 // Configure factory_pFX.
476 factory_pFX.setValidating(false); // No validating (yet)...
477 factory_pFX.setNamespaceAware(false); // No namespaces (yet)...
478 factory_pFX.setIgnoringElementContentWhitespace(true); // Trim unnecessary whitespace.
479
480 return(factory_pFX);
481 }
482 }