001    /*
002    Copyright (c) 1996-2012, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    package org.hd.d.pg2k.svrCore;
030    
031    import java.util.Collections;
032    import java.util.Comparator;
033    import java.util.Enumeration;
034    import java.util.Set;
035    import java.util.SortedSet;
036    import java.util.StringTokenizer;
037    import java.util.TreeSet;
038    
039    /**Utility routines to validate/parse an exhibit name as a String/CharSequence.
040     * An exhibit name is a (relative) path name in a filesystem representation,
041     * and a (relative) URL in a Web presentation.
042     * <p>
043     * This is based on the assumption that we use the String name
044     * everywhere as a sort of universal currency, rather than pre-parsing
045     * everything as in the old Attributes.ItemName.
046     * <p>
047     * The syntax of the file name is:
048     * <ol>
049     * <li>Printable 7-bit ASCII.
 * <li>A category directory name (consisting of characters from the
 *     range [a-z-_] starting with a character in the range [a-z]).
052     * <li>Zero or more directory components of the form _more[0-9A-Z]*
053     * <li>A final unique `file' name consisting of hyphen-separated
054     *     words ending with a recognised extension that indicates the
055     *     MIME type of the underlying file; full details of the syntax
056     *     of this part are given below.
057     * </ol>
058     * <p>
059     * The syntax of a Gallery image name is as follows:
060     *     {word-}+{discardableword-}*[number-]AUTH.ext
061     * where word is an alphanumeric sequence
062     * containing at least one letter (there must
063     * be at least one such word in the name),
064     * discardableword is like word in syntax but
065     * comes from a small list of words that can
066     * trail the main image description and indicate
067     * some gross features of the image, eg `bg'
068     * and `mono' (these words are optional and
069     * will only be recognised as discardable
070     * if after all non-discardable words),
 * number is an optional decimal number consisting
072     * purely of the digits 0-9 (with any leading
073     * zeros ignored rather than indicating octal),
074     * AUTH being the all-upper-case all-alpha
075     * author's initials (this is compulsory), and
076     * ext being the extension indicating the image
077     * type (this is compulsory).
078     * <p>
079     * Note that other than the extension, all components
080     * are delimited by dashes, and there must be no spaces
081     * in the name.  The extension may contain dots, and
082     * dots are not allowed elsewhere in the name.
083     * <p>
084     * For the purposes of sorting, the sort order
 * is first by the {word-}+ portion (ASCII order),
086     * then by the author (ASCII order),
087     * then by the number portion (numerically),
088     * then by the {discardableword-}* portion (ASCII order),
089     * then by the extension (ASCII order).
090     */
091    public final class ExhibitName
092        {
093        /**The character used to separate words. */
094        public static final char WORD_SEP = '-';
095        /**The single character used to separate words as a String value for convenience. */
096        public static final String WORD_SEPS = "-";
097    
098        /**The character used to separate directory components. */
099        public static final char DIR_SEP = '/';
100    
101        /**Maximum valid name length.
102         * This is basically limited by maximum URL and UNIX-filename length,
103         * and the fact that we need some "decoration" overhead of exhibit names
104         * when embedded in filenames, URLs, etc.
105         */
106        public static final int MAX_NAME_LENGTH = 1023 - 128;
107    
108        /**Minimum length of file component of any valid name.
109         * Such a minimal name must be of the form ``a-A.a''.
110         * In practice any real name must be longer since we will
111         * not have one-character author initials nor file extensions,
112         * but this will help quickly discard directory entries
113         * such as "." and ".." for example.
114         */
115        public static final int MIN_FILENAME_LENGTH = 5;
116    
117        /**Minimum valid name length.
118         * Of form "a/a-A.a".
119         */
120        public static final int MIN_NAME_LENGTH = 2 + MIN_FILENAME_LENGTH;
121    
122        /**Maximum allowable length of any single word.
123         * This is where the name is of the form a/word-A.a.
124         */
125        public static final int MAX_WORD_LENGTH = MAX_NAME_LENGTH - 6;
126    
127        /**Maximum allowable length of any name stem (ie just main words).
128         * This is where the name is of the form a/stem-A.a
129         * where the stem is one or more words.
130         */
131        public static final int MAX_STEM_LENGTH = MAX_WORD_LENGTH;
132    
133        /* Maximum allowable attribute word length.
134         * Maximum attribute word must be at most long enough to allow for
135         * a single-letter main word and dash,
136         * so is two less than the longest allowable main word.
137         * This is where the name is of the form ``a/a-attrword-A.a''.
138         */
139        public static final int MAX_ATTR_WORD_LENGTH = MAX_WORD_LENGTH - 2;
140    
141        /**Prefix of intermediate directory components in name. */
142        public static final String intermediateDirPrefix = "_more";
143        /**Length of prefix of intermediate directory components in name. */
144        private static final int _iDirPrefixLen = 5;
145    
146        /**Minimum length of author-initial component; strictly positive. */
147        public static final int MIN_AUTH_INITIALS_LENGTH = 1;
148    
149        /**Maximum length of author-initial component. */
150        public static final int MAX_AUTH_INITIALS_LENGTH = 8;
151    
152        /**Validates the syntax of the first component of a name; returns true if valid.
153         * This is helpful to identify the roots of a directory scan, for example.
154         * <p>
155         * This ignores any characters from len onwards, so the portion from
156         * 0--len-1 must be a complete valid initial directory, and the string
157         * passed must be at least len characters long.
158         * <p>
159         * Designed to be efficiently callable from validNameSyntax() without
160         * requiring creation of any objects.
161         */
162        public static boolean validNameInitialComponentSyntax(final CharSequence initialNameComponent,
163                                                              final int len)
164            {
165            // DO INITIAL SET OF VERY QUICK TESTS.
166            //Assert(initialNameComponent != null);
167            //Assert(len <= initialNameComponent.length());
168            if(len < 1) // Shortest possible name ``a''.
169                { return(false); }
170            // Allow room for a minimal name tail.
171            if(len > MAX_NAME_LENGTH - (MIN_FILENAME_LENGTH-1))
172                { return(false); }
173    
174    //        // Make sure that this is a pure single dir component; no path.
175    //        // Note that this is implicit in the following tests.
176    //        if(-1 != initialNameComponent.lastIndexOf(DIR_SEP, len-1)) { return(false); }
177    
178            // First character of (directory component of) name
179            // must be lower-case (ASCII) letter.
180            final char firstChar = initialNameComponent.charAt(0);
181            if((firstChar < 'a') || (firstChar > 'z'))
182                { return(false); }
183    
184            // Now check the first dir/category component fully,
185            // from after the first char.
186            // Just lower-case letters and dash or underscore are permitted.
187            for(int i = len; --i > 0; )
188                {
189                final char c = initialNameComponent.charAt(i);
190                if(((c < 'a') || (c > 'z')) &&
191                   (c != WORD_SEP) &&
192                   (c != '_'))
193                    { return(false); }
194                }
195    
196            return(true); // Initial dir component OK!
197            }
198    
199        /**Validates a set of author's initials for syntax; returns true if valid.
200         * This version examines part of a string.
201         * <p>
202         * Characters from start to end-1 are checked.
203         * <p>
204         * The start and end positions must be valid in the CharSequence passed and start
205         * start must come before end.
206         * <p>
207         * Returns false if the string is null.
208         *
209         * @param s  the value to be examined
210         * @param start  the starting position of the author;
211         *     must be valid offset within string
212         * @param end  just after the end of the author;
213         *     must be greater than start and no greater than the string length
214         */
215        public static boolean validAuthorSyntax(final CharSequence s,
216                                                final int start,
217                                                final int end)
218            {
219            if(s == null)
220                { return(false); }
221    
222            if((start < 0) || (start > end) || (end > s.length()))
223                { throw new IllegalArgumentException(); }
224    
225            final int len = end - start;
226            if(len < MIN_AUTH_INITIALS_LENGTH) { return(false); }
227            if(len > MAX_AUTH_INITIALS_LENGTH) { return(false); }
228    
229            // The auth initials must run from start up to just before end
230            // and must be upper-case letters.
231            for(int i = end; --i >= start; )
232                {
233                final char c = s.charAt(i);
234                if((c < 'A') || (c > 'Z'))
235                    { return(false); } // Invalid author initial.
236                }
237    
238            // Seems OK.
239            return(true);
240            }
241    
242         /**Validates a set of author's initials for syntax; returns true if valid.
243          * The whole string must be a valid author ID (and not null).
244          *
245          * @return true iff s is not null and is a valid set of author initials
246          */
247         public static boolean validAuthorSyntax(final CharSequence s)
248             { return((s != null) && validAuthorSyntax(s, 0, s.length())); }
249    
250    
251        /**Checks that the CharSequence passed to it is a valid word (main or attribute).
252         * This means it must be non-zero length (and non-null),
253         * and consist only of letters and digits.
254         */
255        public static boolean validWord(final CharSequence s)
256            {
257            if((s == null) ||
258                (s.length() == 0) || (s.length() > MAX_WORD_LENGTH))
259                { return(false); }
260    
261            // Make sure that all characters are individually acceptable...
262            for(int i = s.length(); --i >= 0; )
263                {
264                if(!validWordCharacter(s.charAt(i)))
265                    { return(false); }
266                }
267    
268            return(true); // Seems OK...
269            }
270    
271    
272        /**Checks that the CharSequence passed to it is a valid attribute word.
273         * This means it must be non-zero length (and non-null),
274         * and consist only of letters and digits
275         * and must not consist entirely of digits or upper-case letters
276         * (to avoid ambiguity with the number-in-series value and author).
277         * <p>
278         * Maximum attribute word must be at most long enough to allow for
279         * a single-letter main word and dash,
280         * so is two less than the longest allowable main word.
281         */
282        public static boolean validAttributeWord(final CharSequence s)
283            {
284            if((s == null) ||
285                (s.length() == 0) || (s.length() > MAX_ATTR_WORD_LENGTH))
286                { return(false); }
287    
288            // Make sure that the word is valid in itself...
289            if(!validWord(s)) { return(false); }
290    
291            final char firstChar = s.charAt(0);
292            if((firstChar < '0') || (firstChar > '9'))
293                {
294                // Make sure that word is not entirely digits.
295                for(int i = s.length(); --i >= 0; )
296                    {
297                    final char c = s.charAt(i);
298                    if((c < '0') || (c > '9'))
299                        { return(true); } // Not purely numerical...
300                    }
301                return(false); // Whoops, was only digits.
302                }
303            else if((firstChar < 'A') || (firstChar > 'Z'))
304                {
305                // Make sure that word is not upper-case.
306                for(int i = s.length(); --i >= 0; )
307                    {
308                    final char c = s.charAt(i);
309                    if((c < 'A') || (c > 'Z'))
310                        { return(true); } // Not purely numerical...
311                    }
312                return(false); // Whoops, was only digits.
313                }
314    
315            return(true); // Seems OK...
316            }
317    
318        /**Validates the syntax of the first component of a name; returns true if valid.
319         * This is helpful to identify the roots of a directory scan, for example.
320         */
321        public static boolean validNameInitialComponentSyntax(final CharSequence initialNameComponent)
322            {
323            if(initialNameComponent == null)
324                { return(false); }
325            return(validNameInitialComponentSyntax(initialNameComponent, initialNameComponent.length()));
326            }
327    
328        /**Returns true if the character passed is a valid word character.
329         * A valid word character is an ASCII digit or letter (either case).
330         * <p>
331         * We test the most common cases first for speed.
332         */
333        public static boolean validWordCharacter(final char c)
334            {
335            return(((c >= 'a') && (c <= 'z')) || // Most commonly lower-case.
336                   ((c >= 'A') && (c <= 'Z')) ||
337                   ((c >= '0') && (c <= '9')));
338            }
339    
340        /**Validates the syntax of the last component of a name; returns true if valid.
341         * This is helpful when running a directory scan, for example.
342         * <p>
343         * This does not check that the author's initials or the extension
344         * are actually acceptable other than that they are syntactically valid.
345         */
346        public static boolean validNameFinalComponentSyntax(final CharSequence finalNameComponent)
347            {
348            // DO INITIAL SET OF VERY QUICK TESTS.
349            if(finalNameComponent == null)
350                { return(false); }
351            final int len = finalNameComponent.length();
352            // Reject filenames such as ``.'' and ``..'' quickly.
353            if(len < MIN_FILENAME_LENGTH) // Shortest possible name ``a-A.a''.
354                { return(false); }
355            // Allow room for a minimum-length directory component.
356            if(len > MAX_NAME_LENGTH - 2)
357                { return(false); }
358            // Make sure that this is a pure file component; no path.
359            if(TextUtils.indexOf(finalNameComponent, DIR_SEP) != -1)
360                { return(false); } // Contains a directory component.
361    
362            final int lastDot = TextUtils.lastIndexOf(finalNameComponent, '.');
363            if((lastDot < MIN_FILENAME_LENGTH - 2) || // Extension missing (-1), or this is hidden file (0) or initial part of name too short.
364               (lastDot == len-1)) // Bogus empty extension...
365                { return(false); }
366            if(TextUtils.indexOf(finalNameComponent, '.') != lastDot)
367                { return(false); } // More than one dot; ambiguous extension.
368    
369            // Start looking for the author initials.
370            final int lastDash = TextUtils.lastIndexOf(finalNameComponent, WORD_SEP);
371            // Dismiss some simple malformations quickly...
372            if((lastDash > lastDot) || // Bogus dash in extension.
373               (lastDash == lastDot-1) || // Zero-length author initials
374               (lastDash < 1)) // No dash at all (-1) or zero-length first word (0).
375                { return(false); }
376            // The auth initials must run from lastDash+1 up to just before lastDot
377            // and must be upper-case letters.
378            if(!validAuthorSyntax(finalNameComponent, lastDash+1, lastDot))
379                { return(false); }
380    
381            // Check that the text up to the last dash consists of
382            // one or more dash-terminated words each of which consists
383            // of one or more ASCII letters and digits.
384            // (Two consecutive dashes are not allowed.)
385    
386            // nextWord is positioned successively at the start of
387            // each word.
388            doNextWord: for(int nextWord = 0; nextWord <= lastDash; )
389                {
390                // Gobble up one word checking that it is not zero-length
391                // and contains only valid characters.
392                // We will stop at lastDash at worst.
393                for(int i = nextWord; ; ++i)
394                    {
395                    final char c = finalNameComponent.charAt(i);
396                    if(c == WORD_SEP)
397                        {
398                        if(i == nextWord) { return(false); } // Empty word.
399                        // OK, one valid word seen...
400                        nextWord = i + 1; // Skip to start of next word.
401                        continue doNextWord; // Process next word.
402                        }
403    
404                    // Check that the current word char is valid.
405                    if(!validWordCharacter(c))
406                        { return(false); } // Not a valid word character.
407                    }
408                }
409    
410            return(true); // Seems OK!
411            }
412    
413        /**Very quick basic set of name validity checks; returns true if valid.
414         * Short-cut where the type is statically known to be Name.ExhibitFull;
415         * just checks the value is non-null and if is assumes to already have been validated.
416         */
417        public static boolean validNameSyntaxBasic(final Name.ExhibitFull name)
418            { return(null != name); }
419    
420        /**Very quick basic set of name validity checks; returns true if valid.
421         * Very quick <em>constant-time</em> checks that the name is not null
422         * and is of a legitimate length.
423         */
424        public static boolean validNameSyntaxBasic(final CharSequence name)
425            {
426            if(name == null)
427                { return(false); }
428            final int len = name.length();
429            if(len < 2 + MIN_FILENAME_LENGTH) // Shortest possible name a/a-A.a.
430                { return(false); }
431            if(len > MAX_NAME_LENGTH)
432                { return(false); }
433            return(true);
434            }
435    
436        /**Fully validates the syntax of a name; returns true if valid.
437         * Short-cut where the type is statically known to be Name.ExhibitFull;
438         * just checks the value is non-null and if is assumes to already have been validated.
439         */
440        public static boolean validNameSyntax(final Name.ExhibitFull name)
441            { return(null != name); }
442    
443        /**Fully validates the syntax of a name; returns true if valid.
444         * If the run-time type is (non-null) Name.ExhibitFull then this immediately returns true.
445         * <p>
446         * Else this forces validation of the content.
447         */
448        public static boolean validNameSyntax(final CharSequence name)
449            {
450            if(name instanceof Name.ExhibitFull) { return(true); }
451            return(validNameSyntaxForce(name));
452            }
453    
454        /**Always fully validates the syntax of a name's content; returns true if valid.
455         * It attempts to be fast and to not create too many intermediate/temporary
456         * objects.
457         * <p>
458         * This does not attempt to check a name against current databases
459         * nor return any parsed components.
460         */
461        public static boolean validNameSyntaxForce(final CharSequence name)
462            {
463    //System.err.println("  [validNameSyntax("+name+")]");
464            // DO INITIAL SET OF VERY QUICK TESTS.
465            if(!validNameSyntaxBasic(name))
466                { return(false); }
467            final int len = name.length();
468            // Must contain at least one of each . / and -.
469            final int firstSlash = TextUtils.indexOf(name, DIR_SEP);
470            final int lastSlash = TextUtils.lastIndexOf(name, DIR_SEP);
471            if(lastSlash >= len - MIN_FILENAME_LENGTH) // Final component too short.
472                { return(false); }
473            if((firstSlash < 1) || // No slash at all (-1) or first dir component absent (0) eg absolute path.
474               (TextUtils.lastIndexOf(name, WORD_SEP) < lastSlash)) // Must be at least one word and author in file component...
475                { return(false); }
476    
477    //System.err.println("  [validNameSyntax("+name+"): quick tests passed]");
478    
479            // Invalid initial component?
480            if(!validNameInitialComponentSyntax(name, firstSlash))
481                { return(false); }
482    //System.err.println("  [validNameSyntax("+name+"): validNameInitialComponentSyntax]");
483    
484            // Check intermediate components (if any).
485            // Must be of the form _more[0-9A-Z]*
486            if(lastSlash != firstSlash)
487                {
488    //System.err.println("  [validNameSyntax("+name+"): checking intermediate dir names...]");
489                // Next part points to the _more prefix of the next
490                // intermediate directory.
491                nextDirComponent: for(int nextPart = firstSlash + 1; nextPart <= lastSlash; )
492                    {
493    //System.err.println("   [validNameSyntax("+name+"): looking at "+name.substring(nextPart)+"]");
494                    if(!TextUtils.regionMatches(name, nextPart, intermediateDirPrefix, 0, _iDirPrefixLen))
495                        { return(false); }
496    //System.err.println("   [validNameSyntax("+name+"): intermediate dir prefix OK]");
497    
498                    for(int tail = nextPart + _iDirPrefixLen; ; ++tail)
499                        {
500                        final char c = name.charAt(tail);
501                        if(c == DIR_SEP)
502                            {
503                            // No problems found in dir name component.
504                            nextPart = tail + 1; // Skip over the next slash.
505    //System.err.println("   [validNameSyntax("+name+"): found end of dir component]");
506                            continue nextDirComponent; // Start on next component.
507                            }
508                        if(!(((c >= '0') && (c <= '9')) ||
509                            ((c >= 'A') && (c <= 'Z'))))
510                            { return(false); } // Whoops; invalid character.
511    //System.err.println("   [validNameSyntax("+name+"): dir char "+c+" OK]");
512                        }
513                    }
514                }
515    //System.err.println("  [validNameSyntax("+name+"): intermediate components OK]");
516    
517            // Now thoroughly check the final component.
518            // Oh dear; this may construct a temporary object.
519            if(!validNameFinalComponentSyntax(name.subSequence(lastSlash + 1, len)))
520                { return(false); }
521    //System.err.println("  [validNameSyntax("+name+"): validNameFinalComponentSyntax]");
522            return(true);
523            }
524    
525        /**A simple invariant comparator that sorts full exhibit names in a human-friendly order.
526         * This is essentially a case-insensitive sort on the file component.
527         * <p>
528         * Ties are broken by a normal lexical ordering on the full names.
529         * <p>
530         * Discardable/attribute words are not discarded
531         * nor otherwise treated specially for this comparison.
532         */
533        public static final Comparator<CharSequence> SIMPLE_SMART_ORDER = (new Comparator<CharSequence>(){
534            public final int compare(final CharSequence cs1, final CharSequence cs2)
535                {
536                final boolean cs1NEF = Name.ExhibitFull.class.equals(cs1.getClass());
537                // Compare short/file components, using run-time optimisation for ExhibitFull elements...
538                final boolean cs2NEF = Name.ExhibitFull.class.equals(cs2.getClass());
539                final int ciResult = TextUtils.CASE_INSENSITIVE_ORDER.compare(
540                    cs1NEF ? ((Name.ExhibitFull)cs1).getShortName() : getFileComponent(cs1),
541                    cs2NEF ? ((Name.ExhibitFull)cs2).getShortName() : getFileComponent(cs2));
542                if(ciResult != 0) { return(ciResult); }
543    
544                // Needs a tie-break by full name.
545                if(cs1NEF && cs2NEF) { return(((Name.ExhibitFull) cs1).compareTo((Name.ExhibitFull) cs2)); } // Run-time optimisation for ExhibitFull elements.
546                return(TextUtils.compare(cs1, cs2));
547                }
548            });
549    
550        /**Extract the file component (short name) of a full exhibit name, assuming the name is valid.
551         * Two exhibits should always be distinguishable by this component,
552         * also known as the "short" name.
553         * <p>
554         * If the argument is not a valid full exhibit name, the result is undefined.
555         * <p>
556         * See also ExhibitFullName.getShortName().
557         */
558        public static CharSequence getFileComponent(final CharSequence fullExhibitName)
559            {
560            assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
561            final int length = fullExhibitName.length();
562            return(fullExhibitName.subSequence(TextUtils.lastIndexOf(fullExhibitName, DIR_SEP, length - MIN_FILENAME_LENGTH) + 1, length));
563            }
564    
565        /**Extract the category component (top directory) of a full exhibit name, assuming the name is valid.
566         * If the argument is not a valid full exhibit name, the result is undefined.
567         */
568        public static CharSequence getCategoryComponent(final CharSequence fullExhibitName)
569            {
570            assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
571            return(fullExhibitName.subSequence(0, TextUtils.indexOf(fullExhibitName, DIR_SEP, 1)));
572            }
573    
574        /**Extract the full directory component of a full exhibit name, assuming the name is valid.
575         * This does not include the trailing directory separator.
576         * <p>
577         * If the argument is not a valid full exhibit name, the result is undefined.
578         */
579        public static CharSequence getDirComponent(final CharSequence fullExhibitName)
580            {
581            assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
582            return(fullExhibitName.subSequence(0, TextUtils.lastIndexOf(fullExhibitName, DIR_SEP, fullExhibitName.length() - MIN_FILENAME_LENGTH)));
583            }
584    
585        /**Find the index of the end of the attribute words for a short or long exhibit name; strictly positive.
586         * @param exhibitName  valid full or short exhibit name; never null
587         */
588        public static int getEndOfAttrWords(final CharSequence exhibitName)
589            {
590            // Find the end of the attribute words.
591            final int l = exhibitName.length();
592            final int lastDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, l - 3); // Before author, eg as in "-A.a".
593            // Skip back one word to omit number-in-series value if present.
594            final int endOfAttrWords =
595                (getNumberInSeriesComponentAsString(exhibitName) == null) ?
596                lastDash : TextUtils.lastIndexOf(exhibitName, WORD_SEP, lastDash-1);
597            return(endOfAttrWords);
598            }
599    
600        /**Find the index of the end of the main words for a short or long exhibit name; strictly positive.
601         * @param exhibitName  valid full or short exhibit name; never null
602         * @param lastSlash position of last DIR_SEP or -1 for a short name
603         * @param endOfAttrWords  as returned by getEndOfAttrWords()
604         * @param allAttrWords  a Set of all legal attribute words (String values);
605         *     may be empty but not null
606         */
607        public static int getEndOfMainWords(final CharSequence exhibitName,
608                                             final int lastSlash,
609                                             final int endOfAttrWords,
610                                             final Set<String> allAttrWords)
611            {
612            assert(exhibitName.charAt(endOfAttrWords) == WORD_SEP);
613            // End of main word; same as end of attr words if no attr words.
614            int endOfMainWords = endOfAttrWords; // Always pointing to a dash.
615    
616            // If there are no attribute words then we can skip some processing...
617            if(!allAttrWords.isEmpty())
618                {
619                // Find end of first main word,
620                // given that there must be at least one (of at least length 1).
621                final int endOfFirstMainWord = TextUtils.indexOf(exhibitName, WORD_SEP, lastSlash+2);
622    
623                // Work back a word at a time leaving the boundary after the
624                // first non-attribute word.
625                while(endOfMainWords > endOfFirstMainWord)
626                    {
627                    //assert(fullExhibitName.charAt(endOfMainWords, WORD_SEP));
628    
629                    // We can skip back 2 each time since each word must be at least
630                    // a single letter long preceded by a dash.
631                    final int previousDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, endOfMainWords-2);
632    
633                    final String putativeAttrWord =
634                        exhibitName.subSequence(previousDash + 1, endOfMainWords).toString();
635                    if(!allAttrWords.contains(putativeAttrWord)) { break; }
636                    // If the word was an attribute word then move the boundary back.
637                    endOfMainWords = previousDash;
638                    }
639                }
640    
641            return(endOfMainWords);
642            }
643    
644        /**Extract the main words (stem) component of a valid short exhibit name; never null nor empty.
645         * As getMainWordsComponentFrom() but optimised for this common case,
646         * and should be more efficient than extracting from a full name as less text to scan.
647         *
648         * @param allAttrWords  a Set of all legal attribute words (String values);
649         *     may be empty but not null
650         */
651        public static CharSequence getMainWordsComponentFromShortName(
652                                                    final CharSequence shortExhibitName,
653                                                    final Set<String> allAttrWords)
654            {
655            if(null == shortExhibitName) { throw new IllegalArgumentException(); }
656            assert((shortExhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(shortExhibitName)) : ("not valid short name: " + shortExhibitName);
657    
658            final int endOfAttrWords = getEndOfAttrWords(shortExhibitName);
659            final int endOfMainWords = getEndOfMainWords(shortExhibitName, -1, endOfAttrWords, allAttrWords);
660            final CharSequence result = shortExhibitName.subSequence(0, endOfMainWords);
661            // Should be identical result to getMainWordsComponent().
662    //        assert(TextUtils.contentEquals(result, getMainWordsComponent(shortExhibitName, allAttrWords)));
663            return(result);
664            }
665    
666        /**Find end of main stem and of attribute words of the supplied short or full exhibit name.
667         * This returns a three-element array, of which:
668         * <ul>
669         * <li>Element 0 is the position of the separator immediately
670         *     before the first main word (a DIR_SEP),
671         *     -1 for a short name
672         * <li>Element 1 is the position (index in the input String)
673         * of the separator immediately following the final word of
674         * the main stem of the exhibit name.
675         * All the words of the main stem precede it,
676         * and any attribute words and number-in-series value follow it.
677         * <li>Element 2 of the result is the position (index in the input String)
678         * of the separator immediately following the final attribute word of
679         * the exhibit name.
680         * (If there are no attribute words this will be the same value
681         * as element 1.)
682         * Any number-in-series value follows it.
683         * </ul>
684         * <p>
685         * This has to be passed a set of all valid attribute words
686         * (as Strings which meet the requirements of validAttributeWord())
687         * to be able to compute this boundary.
688         * <p>
689         * If the argument is not a valid full exhibit name, the result is undefined.
690         *
691         * @param allAttrWords  a Set of all legal attribute words (String values);
692         *     may be empty but not null
693         */
694        public static int[] getMainAndAttrWordComponentBoundaries(
695                                                    final CharSequence exhibitName,
696                                                    final Set<String> allAttrWords)
697            {
698            if(null == exhibitName) { throw new IllegalArgumentException(); }
699            assert((exhibitName instanceof Name.ExhibitFull) || (exhibitName instanceof Name.ExhibitShort) ||
700                validNameSyntax(exhibitName) || validNameFinalComponentSyntax(exhibitName)) : ("not valid full or short name: " + exhibitName);
701    
702            final int endOfAttrWords = getEndOfAttrWords(exhibitName);
703    
704            // The last slash is the character before the first main word; -1 if no slash because a short name.
705            final int l = exhibitName.length();
706            final int lastSlash = TextUtils.lastIndexOf(exhibitName, DIR_SEP, l - MIN_FILENAME_LENGTH);
707    
708            // End of main word; same as end of attr words if no attr words.
709            final int endOfMainWords = getEndOfMainWords(exhibitName, lastSlash, endOfAttrWords, allAttrWords);
710    
711    //        assert (lastSlash>0) && (lastSlash < endOfMainWords) && (endOfMainWords <= endOfAttrWords) && (endOfAttrWords <= lastDash);
712    //System.out.println("lastSlash, endOfMainWords, endOfAttrWords: " + lastSlash + ", " + endOfMainWords + ", " + endOfAttrWords);
713    
714            // Return offsets...
715            return(new int[]{ lastSlash, endOfMainWords, endOfAttrWords });
716            }
717    
718    
719        /**Extract the main words (stem) component of a valid full or short exhibit name; never null nor empty.
720         * There is always at least one main word;
721         * so the result is always non-null and non-empty.
722         * <p>
723         * (This does not end nor start with a separator.)
724         * <p>
725         * If the argument is not a valid full or short exhibit name, the result is undefined.
726         *
727         * @param allAttrWords  a Set of all legal attribute words (String values)
728         */
729        public static CharSequence getMainWordsComponent(final CharSequence exhibitName, final Set<String> allAttrWords)
730            {
731            // We rely on getMainAndAttrWordComponentBoundaries() to check args for us.
732            final int[] offsets = getMainAndAttrWordComponentBoundaries(exhibitName, allAttrWords);
733    //        assert(exhibitName.charAt(offsets[0]+1) != WORD_SEP); // First char returned should not be a separator.
734    //        assert(exhibitName.charAt(offsets[1]-1) != WORD_SEP); // Last char returned should not be a separator.
735    //        assert(exhibitName.charAt(offsets[1]) == WORD_SEP); // Char after last returned should be a separator.
736            return(exhibitName.subSequence(offsets[0]+1, offsets[1]));
737            }
738    
739    
740        /**Count the main words int the (stem) component of a valid full or short exhibit name; strictly positive.
741         *
742         * @param allAttrWords  a Set of all legal attribute words (String values)
743         */
744        public static int getMainWordsCount(final CharSequence exhibitName, final Set<String> allAttrWords)
745            {
746    //        assert(validNameSyntax(fullExhibitName)) : ("not valid full name: " + fullExhibitName);
747            int wordCount = 1; // One more word than separator...
748            final CharSequence mwc = ExhibitName.getMainWordsComponent(exhibitName, Collections.<String>emptySet());
749            for(int i = mwc.length(); --i >= 0; )
750                { if(mwc.charAt(i) == ExhibitName.WORD_SEP) { ++wordCount; } }
751            assert(wordCount > 0);
752            return(wordCount);
753            }
754    
755        /**Return Enumeration over main words of a valid full or short name; never null, never empty if the name is well-formed.
756         * Uses StringTokenizer, thus slow and inefficient.
757         */
758        public static Enumeration<?> getMainWords(final CharSequence exhibitName, final Set<String> allAttrWords)
759            {
760            return((new StringTokenizer(getMainWordsComponent(exhibitName, allAttrWords).toString(), WORD_SEPS)));
761            }
762    
763        /**Extract the attribute words component of a full exhibit name, assuming the name is valid.
764         * If there are no attribute words this returns null,
765         * else the result is non-empty
766         * and is the fragment of the full name containing the attribute word
767         * with the words separated as usual.
768         * <p>
769         * If the argument is not a valid full exhibit name, the result is undefined.
770         *
771         * @param allAttrWords  a Set of all legal attribute words (String values)
772         */
773        public static CharSequence getAttributeWordsComponent(final CharSequence fullExhibitName, final Set<String> allAttrWords)
774            {
775            assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
776            final int[] offsets = getMainAndAttrWordComponentBoundaries(fullExhibitName, allAttrWords);
777            if(offsets[1] == offsets[2]) { return(null); } // No attr words.
778            return(fullExhibitName.subSequence(offsets[1]+1, offsets[2]));
779            }
780    
781        /**Extract the attribute words component of a full exhibit name as an Enumeration of String, assuming the name is valid.
782         * If there are no attribute words this returns null,
783         * else the result is non-empty
784         * and is an Enumeration of String values of the attributes in order.
785         * <p>
786         * If the argument is not a valid full exhibit name, the result is undefined.
787         *
788         * @param allAttrWords  a Set of all legal attribute words (String values)
789         */
790        public static Enumeration<?> getAttributeWordsComponentEnumeration(final CharSequence fullExhibitName, final Set<String> allAttrWords)
791            {
792            assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
793            final CharSequence s = getAttributeWordsComponent(fullExhibitName, allAttrWords);
794            if(s == null) { return(null); }
795            return(new StringTokenizer(s.toString(), WORD_SEPS));
796            }
797    
798        /**Extract the attribute words component of a full exhibit name as a SortedSet of String, assuming the name is valid; never null.
799         * If there are no attribute words this returns
800         * a (fixed, immutable) empty set.
801         * <p>
802         * Duplicates attribute words are automatically eliminated
803         * <p>
804         * If the argument is not a valid full exhibit name, the result is undefined.
805         *
806         * @param allAttrWords  a Set of all legal attribute words (String values)
807         *
808         * @return non-null, de-duped, alpha-sorted attribute words from the name
809         */
810        public static SortedSet<String> getAttributeWordsComponentSortedSet(final CharSequence fullExhibitName,
811                                                                            final Set<String> allAttrWords)
812            {
813            assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
814            final CharSequence s = getAttributeWordsComponent(fullExhibitName, allAttrWords);
815            if(s == null) { return(NO_ATTR_WORDS); }
816    
817            final SortedSet<String> result = new TreeSet<String>();
818            final StringTokenizer st = new StringTokenizer(s.toString(), WORD_SEPS);
819            while(st.hasMoreElements()) { result.add(st.nextToken()); }
820    
821            return(result);
822            }
823    
    /**Immutable empty attribute word set.
     * Shared instance returned by getAttributeWordsComponentSortedSet()
     * when a name carries no attribute words;
     * wrapped as unmodifiable so that callers cannot add to it.
     */
    private static final SortedSet<String> NO_ATTR_WORDS =
        Collections.unmodifiableSortedSet(new TreeSet<String>());
827    
828        /**Extract the number-in-series component of a full or short exhibit name, assuming the name is valid.
829         * If the argument is not a valid full or short exhibit name then the result is undefined.
830         * <p>
831         * A missing number-in-series value causes us to return null.
832         * <p>
833         * This is:
834         * <ul>
835         * <li>the final word before the author initials,
836         * <li>must consist entirely of digits,
837         * <li>is always non-negative,
838         * <li>is always to be interpreted in radix 10,
839         *     so "010" = "10" = 10 (ie ten),
840         * <li>and is not the only word.
841         * </ul>
842         */
843        public static CharSequence getNumberInSeriesComponentAsString(final CharSequence exhibitName)
844            {
845            assert(((exhibitName instanceof Name.ExhibitFull) || validNameSyntax(exhibitName)) || ((exhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(exhibitName)));
846            // The number-in-series value, if present,
847            // is the final word and contains only digits
848            // and must not be the only word.
849            final int l = exhibitName.length();
850            final int lastDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, l - 3); // Must be at least one-char auth and extension, eg "-A.a".
851            final int previousDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, lastDash-1);
852            if(previousDash <= 0) { return(null); } // Only word?
853            final CharSequence putativeNiS = exhibitName.subSequence(previousDash+1, lastDash);
854            for(int i = putativeNiS.length(); --i >= 0; )
855                {
856                final char c = putativeNiS.charAt(i);
857                if((c < '0') || (c > '9')) { return(null); } // Not pure number.
858                }
859            // Looks OK!
860            return(putativeNiS);
861            }
862    
863        /**Extract the number-in-series component of a full exhibit name as a non-negative int, assuming the name is valid.
864         * If the argument is not a valid full exhibit name, the result is undefined.
865         * <p>
866         * A missing number-in-series value causes us to return zero.
867         * <p>
868         * This is:
869         * <ul>
870         * <li>the final word before the author initials,
871         * <li>must consist entirely of digits,
872         * <li>is always non-negative,
873         * <li>is always to be interpreted in radix 10,
874         *     so "010" = "10" = 10 (ie ten),
875         * <li>and is not the only word.
876         * </ul>
877         *
878         * @return  positive number-in-series value, or zero if absent
879         */
880        public static int getNumberInSeriesComponent(final CharSequence fullExhibitName)
881            {
882            final CharSequence nis = getNumberInSeriesComponentAsString(fullExhibitName);
883            if(nis == null) { return(0); }
884            return(Integer.parseInt(nis.toString(), 10));
885            }
886    
887        /**Extract the author component of a valid full or short exhibit name, assuming the name is valid.
888         * If the argument is not a valid full exhibit name, the result is undefined
889         * <em>unless</em> the name is the final (file/short) component of a valid name.
890         */
891        public static CharSequence getAuthorComponent(final CharSequence exhibitName)
892            {
893            assert(((exhibitName instanceof Name.ExhibitFull) || validNameSyntax(exhibitName)) || ((exhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(exhibitName)));
894            // The author initials lie between the last dash and the last dot.
895            final int l = exhibitName.length();
896            final int lastDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, l - 3); // Must be at least one-char auth and extension, eg "-A.a".
897            final int lastDot = TextUtils.lastIndexOf(exhibitName, '.', l - 1); // Must be at least one-char extension, eg ".a".
898            return(exhibitName.subSequence(lastDash+1, lastDot));
899            }
900    
901        /**Extract the extension (without dot) of a valid full or short exhibit name, assuming the name is valid.
902         * If the argument is not a valid full exhibit name, the result is undefined
903         * <em>unless</em> the name is the final (file/short) component of a valid name.
904         */
905        public static CharSequence getExtensionComponent(final CharSequence exhibitName)
906            {
907            assert(((exhibitName instanceof Name.ExhibitFull) || validNameSyntax(exhibitName)) || ((exhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(exhibitName)));
908            final int l = exhibitName.length();
909            final int lastDot = TextUtils.lastIndexOf(exhibitName, '.', l - 1); // Must be at least one-char extension, eg ".a".
910            return(exhibitName.subSequence(lastDot+1, l));
911            }
912        }