001 /*
002 Copyright (c) 1996-2012, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029 package org.hd.d.pg2k.svrCore;
030
031 import java.util.Collections;
032 import java.util.Comparator;
033 import java.util.Enumeration;
034 import java.util.Set;
035 import java.util.SortedSet;
036 import java.util.StringTokenizer;
037 import java.util.TreeSet;
038
039 /**Utility routines to validate/parse an exhibit name as a String/CharSequence.
040 * An exhibit name is a (relative) path name in a filesystem representation,
041 * and a (relative) URL in a Web presentation.
042 * <p>
043 * This is based on the assumption that we use the String name
044 * everywhere as a sort of universal currency, rather than pre-parsing
045 * everything as in the old Attributes.ItemName.
046 * <p>
047 * The syntax of the file name is:
048 * <ol>
049 * <li>Printable 7-bit ASCII.
050 * <li>A category directory name (consisting of characters from the
051 * range [a-z-_] starting with a character in the range [a-z]).
052 * <li>Zero or more directory components of the form _more[0-9A-Z]*
053 * <li>A final unique `file' name consisting of hyphen-separated
054 * words ending with a recognised extension that indicates the
055 * MIME type of the underlying file; full details of the syntax
056 * of this part are given below.
057 * </ol>
058 * <p>
059 * The syntax of a Gallery image name is as follows:
060 * {word-}+{discardableword-}*[number-]AUTH.ext
061 * where word is an alphanumeric sequence
062 * containing at least one letter (there must
063 * be at least one such word in the name),
064 * discardableword is like word in syntax but
065 * comes from a small list of words that can
066 * trail the main image description and indicate
067 * some gross features of the image, eg `bg'
068 * and `mono' (these words are optional and
069 * will only be recognised as discardable
070 * if after all non-discardable words),
071 * number is an optional decimal number consisting
072 * purely of the digits 0-9 (with any leading
073 * zeros ignored rather than indicating octal),
074 * AUTH being the all-upper-case all-alpha
075 * author's initials (this is compulsory), and
076 * ext being the extension indicating the image
077 * type (this is compulsory).
078 * <p>
079 * Note that other than the extension, all components
080 * are delimited by dashes, and there must be no spaces
081 * in the name. The extension may contain dots, and
082 * dots are not allowed elsewhere in the name.
083 * <p>
084 * For the purposes of sorting, the sort order
085 * is first by the [word-]+ portion (ASCII order),
086 * then by the author (ASCII order),
087 * then by the number portion (numerically),
088 * then by the {discardableword-}* portion (ASCII order),
089 * then by the extension (ASCII order).
090 */
091 public final class ExhibitName
092 {
093 /**The character used to separate words. */
094 public static final char WORD_SEP = '-';
095 /**The single character used to separate words as a String value for convenience. */
096 public static final String WORD_SEPS = "-";
097
098 /**The character used to separate directory components. */
099 public static final char DIR_SEP = '/';
100
101 /**Maximum valid name length.
102 * This is basically limited by maximum URL and UNIX-filename length,
103 * and the fact that we need some "decoration" overhead of exhibit names
104 * when embedded in filenames, URLs, etc.
105 */
106 public static final int MAX_NAME_LENGTH = 1023 - 128;
107
108 /**Minimum length of file component of any valid name.
109 * Such a minimal name must be of the form ``a-A.a''.
110 * In practice any real name must be longer since we will
111 * not have one-character author initials nor file extensions,
112 * but this will help quickly discard directory entries
113 * such as "." and ".." for example.
114 */
115 public static final int MIN_FILENAME_LENGTH = 5;
116
117 /**Minimum valid name length.
118 * Of form "a/a-A.a".
119 */
120 public static final int MIN_NAME_LENGTH = 2 + MIN_FILENAME_LENGTH;
121
122 /**Maximum allowable length of any single word.
123 * This is where the name is of the form a/word-A.a.
124 */
125 public static final int MAX_WORD_LENGTH = MAX_NAME_LENGTH - 6;
126
127 /**Maximum allowable length of any name stem (ie just main words).
128 * This is where the name is of the form a/stem-A.a
129 * where the stem is one or more words.
130 */
131 public static final int MAX_STEM_LENGTH = MAX_WORD_LENGTH;
132
133 /* Maximum allowable attribute word length.
134 * Maximum attribute word must be at most long enough to allow for
135 * a single-letter main word and dash,
136 * so is two less than the longest allowable main word.
137 * This is where the name is of the form ``a/a-attrword-A.a''.
138 */
139 public static final int MAX_ATTR_WORD_LENGTH = MAX_WORD_LENGTH - 2;
140
141 /**Prefix of intermediate directory components in name. */
142 public static final String intermediateDirPrefix = "_more";
143 /**Length of prefix of intermediate directory components in name. */
144 private static final int _iDirPrefixLen = 5;
145
146 /**Minimum length of author-initial component; strictly positive. */
147 public static final int MIN_AUTH_INITIALS_LENGTH = 1;
148
149 /**Maximum length of author-initial component. */
150 public static final int MAX_AUTH_INITIALS_LENGTH = 8;
151
152 /**Validates the syntax of the first component of a name; returns true if valid.
153 * This is helpful to identify the roots of a directory scan, for example.
154 * <p>
155 * This ignores any characters from len onwards, so the portion from
156 * 0--len-1 must be a complete valid initial directory, and the string
157 * passed must be at least len characters long.
158 * <p>
159 * Designed to be efficiently callable from validNameSyntax() without
160 * requiring creation of any objects.
161 */
162 public static boolean validNameInitialComponentSyntax(final CharSequence initialNameComponent,
163 final int len)
164 {
165 // DO INITIAL SET OF VERY QUICK TESTS.
166 //Assert(initialNameComponent != null);
167 //Assert(len <= initialNameComponent.length());
168 if(len < 1) // Shortest possible name ``a''.
169 { return(false); }
170 // Allow room for a minimal name tail.
171 if(len > MAX_NAME_LENGTH - (MIN_FILENAME_LENGTH-1))
172 { return(false); }
173
174 // // Make sure that this is a pure single dir component; no path.
175 // // Note that this is implicit in the following tests.
176 // if(-1 != initialNameComponent.lastIndexOf(DIR_SEP, len-1)) { return(false); }
177
178 // First character of (directory component of) name
179 // must be lower-case (ASCII) letter.
180 final char firstChar = initialNameComponent.charAt(0);
181 if((firstChar < 'a') || (firstChar > 'z'))
182 { return(false); }
183
184 // Now check the first dir/category component fully,
185 // from after the first char.
186 // Just lower-case letters and dash or underscore are permitted.
187 for(int i = len; --i > 0; )
188 {
189 final char c = initialNameComponent.charAt(i);
190 if(((c < 'a') || (c > 'z')) &&
191 (c != WORD_SEP) &&
192 (c != '_'))
193 { return(false); }
194 }
195
196 return(true); // Initial dir component OK!
197 }
198
199 /**Validates a set of author's initials for syntax; returns true if valid.
200 * This version examines part of a string.
201 * <p>
202 * Characters from start to end-1 are checked.
203 * <p>
204 * The start and end positions must be valid in the CharSequence passed and start
205 * start must come before end.
206 * <p>
207 * Returns false if the string is null.
208 *
209 * @param s the value to be examined
210 * @param start the starting position of the author;
211 * must be valid offset within string
212 * @param end just after the end of the author;
213 * must be greater than start and no greater than the string length
214 */
215 public static boolean validAuthorSyntax(final CharSequence s,
216 final int start,
217 final int end)
218 {
219 if(s == null)
220 { return(false); }
221
222 if((start < 0) || (start > end) || (end > s.length()))
223 { throw new IllegalArgumentException(); }
224
225 final int len = end - start;
226 if(len < MIN_AUTH_INITIALS_LENGTH) { return(false); }
227 if(len > MAX_AUTH_INITIALS_LENGTH) { return(false); }
228
229 // The auth initials must run from start up to just before end
230 // and must be upper-case letters.
231 for(int i = end; --i >= start; )
232 {
233 final char c = s.charAt(i);
234 if((c < 'A') || (c > 'Z'))
235 { return(false); } // Invalid author initial.
236 }
237
238 // Seems OK.
239 return(true);
240 }
241
242 /**Validates a set of author's initials for syntax; returns true if valid.
243 * The whole string must be a valid author ID (and not null).
244 *
245 * @return true iff s is not null and is a valid set of author initials
246 */
247 public static boolean validAuthorSyntax(final CharSequence s)
248 { return((s != null) && validAuthorSyntax(s, 0, s.length())); }
249
250
251 /**Checks that the CharSequence passed to it is a valid word (main or attribute).
252 * This means it must be non-zero length (and non-null),
253 * and consist only of letters and digits.
254 */
255 public static boolean validWord(final CharSequence s)
256 {
257 if((s == null) ||
258 (s.length() == 0) || (s.length() > MAX_WORD_LENGTH))
259 { return(false); }
260
261 // Make sure that all characters are individually acceptable...
262 for(int i = s.length(); --i >= 0; )
263 {
264 if(!validWordCharacter(s.charAt(i)))
265 { return(false); }
266 }
267
268 return(true); // Seems OK...
269 }
270
271
272 /**Checks that the CharSequence passed to it is a valid attribute word.
273 * This means it must be non-zero length (and non-null),
274 * and consist only of letters and digits
275 * and must not consist entirely of digits or upper-case letters
276 * (to avoid ambiguity with the number-in-series value and author).
277 * <p>
278 * Maximum attribute word must be at most long enough to allow for
279 * a single-letter main word and dash,
280 * so is two less than the longest allowable main word.
281 */
282 public static boolean validAttributeWord(final CharSequence s)
283 {
284 if((s == null) ||
285 (s.length() == 0) || (s.length() > MAX_ATTR_WORD_LENGTH))
286 { return(false); }
287
288 // Make sure that the word is valid in itself...
289 if(!validWord(s)) { return(false); }
290
291 final char firstChar = s.charAt(0);
292 if((firstChar < '0') || (firstChar > '9'))
293 {
294 // Make sure that word is not entirely digits.
295 for(int i = s.length(); --i >= 0; )
296 {
297 final char c = s.charAt(i);
298 if((c < '0') || (c > '9'))
299 { return(true); } // Not purely numerical...
300 }
301 return(false); // Whoops, was only digits.
302 }
303 else if((firstChar < 'A') || (firstChar > 'Z'))
304 {
305 // Make sure that word is not upper-case.
306 for(int i = s.length(); --i >= 0; )
307 {
308 final char c = s.charAt(i);
309 if((c < 'A') || (c > 'Z'))
310 { return(true); } // Not purely numerical...
311 }
312 return(false); // Whoops, was only digits.
313 }
314
315 return(true); // Seems OK...
316 }
317
318 /**Validates the syntax of the first component of a name; returns true if valid.
319 * This is helpful to identify the roots of a directory scan, for example.
320 */
321 public static boolean validNameInitialComponentSyntax(final CharSequence initialNameComponent)
322 {
323 if(initialNameComponent == null)
324 { return(false); }
325 return(validNameInitialComponentSyntax(initialNameComponent, initialNameComponent.length()));
326 }
327
328 /**Returns true if the character passed is a valid word character.
329 * A valid word character is an ASCII digit or letter (either case).
330 * <p>
331 * We test the most common cases first for speed.
332 */
333 public static boolean validWordCharacter(final char c)
334 {
335 return(((c >= 'a') && (c <= 'z')) || // Most commonly lower-case.
336 ((c >= 'A') && (c <= 'Z')) ||
337 ((c >= '0') && (c <= '9')));
338 }
339
340 /**Validates the syntax of the last component of a name; returns true if valid.
341 * This is helpful when running a directory scan, for example.
342 * <p>
343 * This does not check that the author's initials or the extension
344 * are actually acceptable other than that they are syntactically valid.
345 */
346 public static boolean validNameFinalComponentSyntax(final CharSequence finalNameComponent)
347 {
348 // DO INITIAL SET OF VERY QUICK TESTS.
349 if(finalNameComponent == null)
350 { return(false); }
351 final int len = finalNameComponent.length();
352 // Reject filenames such as ``.'' and ``..'' quickly.
353 if(len < MIN_FILENAME_LENGTH) // Shortest possible name ``a-A.a''.
354 { return(false); }
355 // Allow room for a minimum-length directory component.
356 if(len > MAX_NAME_LENGTH - 2)
357 { return(false); }
358 // Make sure that this is a pure file component; no path.
359 if(TextUtils.indexOf(finalNameComponent, DIR_SEP) != -1)
360 { return(false); } // Contains a directory component.
361
362 final int lastDot = TextUtils.lastIndexOf(finalNameComponent, '.');
363 if((lastDot < MIN_FILENAME_LENGTH - 2) || // Extension missing (-1), or this is hidden file (0) or initial part of name too short.
364 (lastDot == len-1)) // Bogus empty extension...
365 { return(false); }
366 if(TextUtils.indexOf(finalNameComponent, '.') != lastDot)
367 { return(false); } // More than one dot; ambiguous extension.
368
369 // Start looking for the author initials.
370 final int lastDash = TextUtils.lastIndexOf(finalNameComponent, WORD_SEP);
371 // Dismiss some simple malformations quickly...
372 if((lastDash > lastDot) || // Bogus dash in extension.
373 (lastDash == lastDot-1) || // Zero-length author initials
374 (lastDash < 1)) // No dash at all (-1) or zero-length first word (0).
375 { return(false); }
376 // The auth initials must run from lastDash+1 up to just before lastDot
377 // and must be upper-case letters.
378 if(!validAuthorSyntax(finalNameComponent, lastDash+1, lastDot))
379 { return(false); }
380
381 // Check that the text up to the last dash consists of
382 // one or more dash-terminated words each of which consists
383 // of one or more ASCII letters and digits.
384 // (Two consecutive dashes are not allowed.)
385
386 // nextWord is positioned successively at the start of
387 // each word.
388 doNextWord: for(int nextWord = 0; nextWord <= lastDash; )
389 {
390 // Gobble up one word checking that it is not zero-length
391 // and contains only valid characters.
392 // We will stop at lastDash at worst.
393 for(int i = nextWord; ; ++i)
394 {
395 final char c = finalNameComponent.charAt(i);
396 if(c == WORD_SEP)
397 {
398 if(i == nextWord) { return(false); } // Empty word.
399 // OK, one valid word seen...
400 nextWord = i + 1; // Skip to start of next word.
401 continue doNextWord; // Process next word.
402 }
403
404 // Check that the current word char is valid.
405 if(!validWordCharacter(c))
406 { return(false); } // Not a valid word character.
407 }
408 }
409
410 return(true); // Seems OK!
411 }
412
413 /**Very quick basic set of name validity checks; returns true if valid.
414 * Short-cut where the type is statically known to be Name.ExhibitFull;
415 * just checks the value is non-null and if is assumes to already have been validated.
416 */
417 public static boolean validNameSyntaxBasic(final Name.ExhibitFull name)
418 { return(null != name); }
419
420 /**Very quick basic set of name validity checks; returns true if valid.
421 * Very quick <em>constant-time</em> checks that the name is not null
422 * and is of a legitimate length.
423 */
424 public static boolean validNameSyntaxBasic(final CharSequence name)
425 {
426 if(name == null)
427 { return(false); }
428 final int len = name.length();
429 if(len < 2 + MIN_FILENAME_LENGTH) // Shortest possible name a/a-A.a.
430 { return(false); }
431 if(len > MAX_NAME_LENGTH)
432 { return(false); }
433 return(true);
434 }
435
436 /**Fully validates the syntax of a name; returns true if valid.
437 * Short-cut where the type is statically known to be Name.ExhibitFull;
438 * just checks the value is non-null and if is assumes to already have been validated.
439 */
440 public static boolean validNameSyntax(final Name.ExhibitFull name)
441 { return(null != name); }
442
443 /**Fully validates the syntax of a name; returns true if valid.
444 * If the run-time type is (non-null) Name.ExhibitFull then this immediately returns true.
445 * <p>
446 * Else this forces validation of the content.
447 */
448 public static boolean validNameSyntax(final CharSequence name)
449 {
450 if(name instanceof Name.ExhibitFull) { return(true); }
451 return(validNameSyntaxForce(name));
452 }
453
454 /**Always fully validates the syntax of a name's content; returns true if valid.
455 * It attempts to be fast and to not create too many intermediate/temporary
456 * objects.
457 * <p>
458 * This does not attempt to check a name against current databases
459 * nor return any parsed components.
460 */
461 public static boolean validNameSyntaxForce(final CharSequence name)
462 {
463 //System.err.println(" [validNameSyntax("+name+")]");
464 // DO INITIAL SET OF VERY QUICK TESTS.
465 if(!validNameSyntaxBasic(name))
466 { return(false); }
467 final int len = name.length();
468 // Must contain at least one of each . / and -.
469 final int firstSlash = TextUtils.indexOf(name, DIR_SEP);
470 final int lastSlash = TextUtils.lastIndexOf(name, DIR_SEP);
471 if(lastSlash >= len - MIN_FILENAME_LENGTH) // Final component too short.
472 { return(false); }
473 if((firstSlash < 1) || // No slash at all (-1) or first dir component absent (0) eg absolute path.
474 (TextUtils.lastIndexOf(name, WORD_SEP) < lastSlash)) // Must be at least one word and author in file component...
475 { return(false); }
476
477 //System.err.println(" [validNameSyntax("+name+"): quick tests passed]");
478
479 // Invalid initial component?
480 if(!validNameInitialComponentSyntax(name, firstSlash))
481 { return(false); }
482 //System.err.println(" [validNameSyntax("+name+"): validNameInitialComponentSyntax]");
483
484 // Check intermediate components (if any).
485 // Must be of the form _more[0-9A-Z]*
486 if(lastSlash != firstSlash)
487 {
488 //System.err.println(" [validNameSyntax("+name+"): checking intermediate dir names...]");
489 // Next part points to the _more prefix of the next
490 // intermediate directory.
491 nextDirComponent: for(int nextPart = firstSlash + 1; nextPart <= lastSlash; )
492 {
493 //System.err.println(" [validNameSyntax("+name+"): looking at "+name.substring(nextPart)+"]");
494 if(!TextUtils.regionMatches(name, nextPart, intermediateDirPrefix, 0, _iDirPrefixLen))
495 { return(false); }
496 //System.err.println(" [validNameSyntax("+name+"): intermediate dir prefix OK]");
497
498 for(int tail = nextPart + _iDirPrefixLen; ; ++tail)
499 {
500 final char c = name.charAt(tail);
501 if(c == DIR_SEP)
502 {
503 // No problems found in dir name component.
504 nextPart = tail + 1; // Skip over the next slash.
505 //System.err.println(" [validNameSyntax("+name+"): found end of dir component]");
506 continue nextDirComponent; // Start on next component.
507 }
508 if(!(((c >= '0') && (c <= '9')) ||
509 ((c >= 'A') && (c <= 'Z'))))
510 { return(false); } // Whoops; invalid character.
511 //System.err.println(" [validNameSyntax("+name+"): dir char "+c+" OK]");
512 }
513 }
514 }
515 //System.err.println(" [validNameSyntax("+name+"): intermediate components OK]");
516
517 // Now thoroughly check the final component.
518 // Oh dear; this may construct a temporary object.
519 if(!validNameFinalComponentSyntax(name.subSequence(lastSlash + 1, len)))
520 { return(false); }
521 //System.err.println(" [validNameSyntax("+name+"): validNameFinalComponentSyntax]");
522 return(true);
523 }
524
525 /**A simple invariant comparator that sorts full exhibit names in a human-friendly order.
526 * This is essentially a case-insensitive sort on the file component.
527 * <p>
528 * Ties are broken by a normal lexical ordering on the full names.
529 * <p>
530 * Discardable/attribute words are not discarded
531 * nor otherwise treated specially for this comparison.
532 */
533 public static final Comparator<CharSequence> SIMPLE_SMART_ORDER = (new Comparator<CharSequence>(){
534 public final int compare(final CharSequence cs1, final CharSequence cs2)
535 {
536 final boolean cs1NEF = Name.ExhibitFull.class.equals(cs1.getClass());
537 // Compare short/file components, using run-time optimisation for ExhibitFull elements...
538 final boolean cs2NEF = Name.ExhibitFull.class.equals(cs2.getClass());
539 final int ciResult = TextUtils.CASE_INSENSITIVE_ORDER.compare(
540 cs1NEF ? ((Name.ExhibitFull)cs1).getShortName() : getFileComponent(cs1),
541 cs2NEF ? ((Name.ExhibitFull)cs2).getShortName() : getFileComponent(cs2));
542 if(ciResult != 0) { return(ciResult); }
543
544 // Needs a tie-break by full name.
545 if(cs1NEF && cs2NEF) { return(((Name.ExhibitFull) cs1).compareTo((Name.ExhibitFull) cs2)); } // Run-time optimisation for ExhibitFull elements.
546 return(TextUtils.compare(cs1, cs2));
547 }
548 });
549
550 /**Extract the file component (short name) of a full exhibit name, assuming the name is valid.
551 * Two exhibits should always be distinguishable by this component,
552 * also known as the "short" name.
553 * <p>
554 * If the argument is not a valid full exhibit name, the result is undefined.
555 * <p>
556 * See also ExhibitFullName.getShortName().
557 */
558 public static CharSequence getFileComponent(final CharSequence fullExhibitName)
559 {
560 assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
561 final int length = fullExhibitName.length();
562 return(fullExhibitName.subSequence(TextUtils.lastIndexOf(fullExhibitName, DIR_SEP, length - MIN_FILENAME_LENGTH) + 1, length));
563 }
564
565 /**Extract the category component (top directory) of a full exhibit name, assuming the name is valid.
566 * If the argument is not a valid full exhibit name, the result is undefined.
567 */
568 public static CharSequence getCategoryComponent(final CharSequence fullExhibitName)
569 {
570 assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
571 return(fullExhibitName.subSequence(0, TextUtils.indexOf(fullExhibitName, DIR_SEP, 1)));
572 }
573
574 /**Extract the full directory component of a full exhibit name, assuming the name is valid.
575 * This does not include the trailing directory separator.
576 * <p>
577 * If the argument is not a valid full exhibit name, the result is undefined.
578 */
579 public static CharSequence getDirComponent(final CharSequence fullExhibitName)
580 {
581 assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
582 return(fullExhibitName.subSequence(0, TextUtils.lastIndexOf(fullExhibitName, DIR_SEP, fullExhibitName.length() - MIN_FILENAME_LENGTH)));
583 }
584
585 /**Find the index of the end of the attribute words for a short or long exhibit name; strictly positive.
586 * @param exhibitName valid full or short exhibit name; never null
587 */
588 public static int getEndOfAttrWords(final CharSequence exhibitName)
589 {
590 // Find the end of the attribute words.
591 final int l = exhibitName.length();
592 final int lastDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, l - 3); // Before author, eg as in "-A.a".
593 // Skip back one word to omit number-in-series value if present.
594 final int endOfAttrWords =
595 (getNumberInSeriesComponentAsString(exhibitName) == null) ?
596 lastDash : TextUtils.lastIndexOf(exhibitName, WORD_SEP, lastDash-1);
597 return(endOfAttrWords);
598 }
599
600 /**Find the index of the end of the main words for a short or long exhibit name; strictly positive.
601 * @param exhibitName valid full or short exhibit name; never null
602 * @param lastSlash position of last DIR_SEP or -1 for a short name
603 * @param endOfAttrWords as returned by getEndOfAttrWords()
604 * @param allAttrWords a Set of all legal attribute words (String values);
605 * may be empty but not null
606 */
607 public static int getEndOfMainWords(final CharSequence exhibitName,
608 final int lastSlash,
609 final int endOfAttrWords,
610 final Set<String> allAttrWords)
611 {
612 assert(exhibitName.charAt(endOfAttrWords) == WORD_SEP);
613 // End of main word; same as end of attr words if no attr words.
614 int endOfMainWords = endOfAttrWords; // Always pointing to a dash.
615
616 // If there are no attribute words then we can skip some processing...
617 if(!allAttrWords.isEmpty())
618 {
619 // Find end of first main word,
620 // given that there must be at least one (of at least length 1).
621 final int endOfFirstMainWord = TextUtils.indexOf(exhibitName, WORD_SEP, lastSlash+2);
622
623 // Work back a word at a time leaving the boundary after the
624 // first non-attribute word.
625 while(endOfMainWords > endOfFirstMainWord)
626 {
627 //assert(fullExhibitName.charAt(endOfMainWords, WORD_SEP));
628
629 // We can skip back 2 each time since each word must be at least
630 // a single letter long preceded by a dash.
631 final int previousDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, endOfMainWords-2);
632
633 final String putativeAttrWord =
634 exhibitName.subSequence(previousDash + 1, endOfMainWords).toString();
635 if(!allAttrWords.contains(putativeAttrWord)) { break; }
636 // If the word was an attribute word then move the boundary back.
637 endOfMainWords = previousDash;
638 }
639 }
640
641 return(endOfMainWords);
642 }
643
644 /**Extract the main words (stem) component of a valid short exhibit name; never null nor empty.
645 * As getMainWordsComponentFrom() but optimised for this common case,
646 * and should be more efficient than extracting from a full name as less text to scan.
647 *
648 * @param allAttrWords a Set of all legal attribute words (String values);
649 * may be empty but not null
650 */
651 public static CharSequence getMainWordsComponentFromShortName(
652 final CharSequence shortExhibitName,
653 final Set<String> allAttrWords)
654 {
655 if(null == shortExhibitName) { throw new IllegalArgumentException(); }
656 assert((shortExhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(shortExhibitName)) : ("not valid short name: " + shortExhibitName);
657
658 final int endOfAttrWords = getEndOfAttrWords(shortExhibitName);
659 final int endOfMainWords = getEndOfMainWords(shortExhibitName, -1, endOfAttrWords, allAttrWords);
660 final CharSequence result = shortExhibitName.subSequence(0, endOfMainWords);
661 // Should be identical result to getMainWordsComponent().
662 // assert(TextUtils.contentEquals(result, getMainWordsComponent(shortExhibitName, allAttrWords)));
663 return(result);
664 }
665
666 /**Find end of main stem and of attribute words of the supplied short or full exhibit name.
667 * This returns a three-element array, of which:
668 * <ul>
669 * <li>Element 0 is the position of the separator immediately
670 * before the first main word (a DIR_SEP),
671 * -1 for a short name
672 * <li>Element 1 is the position (index in the input String)
673 * of the separator immediately following the final word of
674 * the main stem of the exhibit name.
675 * All the words of the main stem precede it,
676 * and any attribute words and number-in-series value follow it.
677 * <li>Element 2 of the result is the position (index in the input String)
678 * of the separator immediately following the final attribute word of
679 * the exhibit name.
680 * (If there are no attribute words this will be the same value
681 * as element 1.)
682 * Any number-in-series value follows it.
683 * </ul>
684 * <p>
685 * This has to be passed a set of all valid attribute words
686 * (as Strings which meet the requirements of validAttributeWord())
687 * to be able to compute this boundary.
688 * <p>
689 * If the argument is not a valid full exhibit name, the result is undefined.
690 *
691 * @param allAttrWords a Set of all legal attribute words (String values);
692 * may be empty but not null
693 */
694 public static int[] getMainAndAttrWordComponentBoundaries(
695 final CharSequence exhibitName,
696 final Set<String> allAttrWords)
697 {
698 if(null == exhibitName) { throw new IllegalArgumentException(); }
699 assert((exhibitName instanceof Name.ExhibitFull) || (exhibitName instanceof Name.ExhibitShort) ||
700 validNameSyntax(exhibitName) || validNameFinalComponentSyntax(exhibitName)) : ("not valid full or short name: " + exhibitName);
701
702 final int endOfAttrWords = getEndOfAttrWords(exhibitName);
703
704 // The last slash is the character before the first main word; -1 if no slash because a short name.
705 final int l = exhibitName.length();
706 final int lastSlash = TextUtils.lastIndexOf(exhibitName, DIR_SEP, l - MIN_FILENAME_LENGTH);
707
708 // End of main word; same as end of attr words if no attr words.
709 final int endOfMainWords = getEndOfMainWords(exhibitName, lastSlash, endOfAttrWords, allAttrWords);
710
711 // assert (lastSlash>0) && (lastSlash < endOfMainWords) && (endOfMainWords <= endOfAttrWords) && (endOfAttrWords <= lastDash);
712 //System.out.println("lastSlash, endOfMainWords, endOfAttrWords: " + lastSlash + ", " + endOfMainWords + ", " + endOfAttrWords);
713
714 // Return offsets...
715 return(new int[]{ lastSlash, endOfMainWords, endOfAttrWords });
716 }
717
718
719 /**Extract the main words (stem) component of a valid full or short exhibit name; never null nor empty.
720 * There is always at least one main word;
721 * so the result is always non-null and non-empty.
722 * <p>
723 * (This does not end nor start with a separator.)
724 * <p>
725 * If the argument is not a valid full or short exhibit name, the result is undefined.
726 *
727 * @param allAttrWords a Set of all legal attribute words (String values)
728 */
729 public static CharSequence getMainWordsComponent(final CharSequence exhibitName, final Set<String> allAttrWords)
730 {
731 // We rely on getMainAndAttrWordComponentBoundaries() to check args for us.
732 final int[] offsets = getMainAndAttrWordComponentBoundaries(exhibitName, allAttrWords);
733 // assert(exhibitName.charAt(offsets[0]+1) != WORD_SEP); // First char returned should not be a separator.
734 // assert(exhibitName.charAt(offsets[1]-1) != WORD_SEP); // Last char returned should not be a separator.
735 // assert(exhibitName.charAt(offsets[1]) == WORD_SEP); // Char after last returned should be a separator.
736 return(exhibitName.subSequence(offsets[0]+1, offsets[1]));
737 }
738
739
740 /**Count the main words int the (stem) component of a valid full or short exhibit name; strictly positive.
741 *
742 * @param allAttrWords a Set of all legal attribute words (String values)
743 */
744 public static int getMainWordsCount(final CharSequence exhibitName, final Set<String> allAttrWords)
745 {
746 // assert(validNameSyntax(fullExhibitName)) : ("not valid full name: " + fullExhibitName);
747 int wordCount = 1; // One more word than separator...
748 final CharSequence mwc = ExhibitName.getMainWordsComponent(exhibitName, Collections.<String>emptySet());
749 for(int i = mwc.length(); --i >= 0; )
750 { if(mwc.charAt(i) == ExhibitName.WORD_SEP) { ++wordCount; } }
751 assert(wordCount > 0);
752 return(wordCount);
753 }
754
755 /**Return Enumeration over main words of a valid full or short name; never null, never empty if the name is well-formed.
756 * Uses StringTokenizer, thus slow and inefficient.
757 */
758 public static Enumeration<?> getMainWords(final CharSequence exhibitName, final Set<String> allAttrWords)
759 {
760 return((new StringTokenizer(getMainWordsComponent(exhibitName, allAttrWords).toString(), WORD_SEPS)));
761 }
762
763 /**Extract the attribute words component of a full exhibit name, assuming the name is valid.
764 * If there are no attribute words this returns null,
765 * else the result is non-empty
766 * and is the fragment of the full name containing the attribute word
767 * with the words separated as usual.
768 * <p>
769 * If the argument is not a valid full exhibit name, the result is undefined.
770 *
771 * @param allAttrWords a Set of all legal attribute words (String values)
772 */
773 public static CharSequence getAttributeWordsComponent(final CharSequence fullExhibitName, final Set<String> allAttrWords)
774 {
775 assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
776 final int[] offsets = getMainAndAttrWordComponentBoundaries(fullExhibitName, allAttrWords);
777 if(offsets[1] == offsets[2]) { return(null); } // No attr words.
778 return(fullExhibitName.subSequence(offsets[1]+1, offsets[2]));
779 }
780
781 /**Extract the attribute words component of a full exhibit name as an Enumeration of String, assuming the name is valid.
782 * If there are no attribute words this returns null,
783 * else the result is non-empty
784 * and is an Enumeration of String values of the attributes in order.
785 * <p>
786 * If the argument is not a valid full exhibit name, the result is undefined.
787 *
788 * @param allAttrWords a Set of all legal attribute words (String values)
789 */
790 public static Enumeration<?> getAttributeWordsComponentEnumeration(final CharSequence fullExhibitName, final Set<String> allAttrWords)
791 {
792 assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
793 final CharSequence s = getAttributeWordsComponent(fullExhibitName, allAttrWords);
794 if(s == null) { return(null); }
795 return(new StringTokenizer(s.toString(), WORD_SEPS));
796 }
797
798 /**Extract the attribute words component of a full exhibit name as a SortedSet of String, assuming the name is valid; never null.
799 * If there are no attribute words this returns
800 * a (fixed, immutable) empty set.
801 * <p>
802 * Duplicates attribute words are automatically eliminated
803 * <p>
804 * If the argument is not a valid full exhibit name, the result is undefined.
805 *
806 * @param allAttrWords a Set of all legal attribute words (String values)
807 *
808 * @return non-null, de-duped, alpha-sorted attribute words from the name
809 */
810 public static SortedSet<String> getAttributeWordsComponentSortedSet(final CharSequence fullExhibitName,
811 final Set<String> allAttrWords)
812 {
813 assert((fullExhibitName instanceof Name.ExhibitFull) || validNameSyntax(fullExhibitName));
814 final CharSequence s = getAttributeWordsComponent(fullExhibitName, allAttrWords);
815 if(s == null) { return(NO_ATTR_WORDS); }
816
817 final SortedSet<String> result = new TreeSet<String>();
818 final StringTokenizer st = new StringTokenizer(s.toString(), WORD_SEPS);
819 while(st.hasMoreElements()) { result.add(st.nextToken()); }
820
821 return(result);
822 }
823
824 /**Immutable empty attribute word set. */
825 private static final SortedSet<String> NO_ATTR_WORDS =
826 Collections.unmodifiableSortedSet(new TreeSet<String>());
827
828 /**Extract the number-in-series component of a full or short exhibit name, assuming the name is valid.
829 * If the argument is not a valid full or short exhibit name then the result is undefined.
830 * <p>
831 * A missing number-in-series value causes us to return null.
832 * <p>
833 * This is:
834 * <ul>
835 * <li>the final word before the author initials,
836 * <li>must consist entirely of digits,
837 * <li>is always non-negative,
838 * <li>is always to be interpreted in radix 10,
839 * so "010" = "10" = 10 (ie ten),
840 * <li>and is not the only word.
841 * </ul>
842 */
843 public static CharSequence getNumberInSeriesComponentAsString(final CharSequence exhibitName)
844 {
845 assert(((exhibitName instanceof Name.ExhibitFull) || validNameSyntax(exhibitName)) || ((exhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(exhibitName)));
846 // The number-in-series value, if present,
847 // is the final word and contains only digits
848 // and must not be the only word.
849 final int l = exhibitName.length();
850 final int lastDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, l - 3); // Must be at least one-char auth and extension, eg "-A.a".
851 final int previousDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, lastDash-1);
852 if(previousDash <= 0) { return(null); } // Only word?
853 final CharSequence putativeNiS = exhibitName.subSequence(previousDash+1, lastDash);
854 for(int i = putativeNiS.length(); --i >= 0; )
855 {
856 final char c = putativeNiS.charAt(i);
857 if((c < '0') || (c > '9')) { return(null); } // Not pure number.
858 }
859 // Looks OK!
860 return(putativeNiS);
861 }
862
863 /**Extract the number-in-series component of a full exhibit name as a non-negative int, assuming the name is valid.
864 * If the argument is not a valid full exhibit name, the result is undefined.
865 * <p>
866 * A missing number-in-series value causes us to return zero.
867 * <p>
868 * This is:
869 * <ul>
870 * <li>the final word before the author initials,
871 * <li>must consist entirely of digits,
872 * <li>is always non-negative,
873 * <li>is always to be interpreted in radix 10,
874 * so "010" = "10" = 10 (ie ten),
875 * <li>and is not the only word.
876 * </ul>
877 *
878 * @return positive number-in-series value, or zero if absent
879 */
880 public static int getNumberInSeriesComponent(final CharSequence fullExhibitName)
881 {
882 final CharSequence nis = getNumberInSeriesComponentAsString(fullExhibitName);
883 if(nis == null) { return(0); }
884 return(Integer.parseInt(nis.toString(), 10));
885 }
886
887 /**Extract the author component of a valid full or short exhibit name, assuming the name is valid.
888 * If the argument is not a valid full exhibit name, the result is undefined
889 * <em>unless</em> the name is the final (file/short) component of a valid name.
890 */
891 public static CharSequence getAuthorComponent(final CharSequence exhibitName)
892 {
893 assert(((exhibitName instanceof Name.ExhibitFull) || validNameSyntax(exhibitName)) || ((exhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(exhibitName)));
894 // The author initials lie between the last dash and the last dot.
895 final int l = exhibitName.length();
896 final int lastDash = TextUtils.lastIndexOf(exhibitName, WORD_SEP, l - 3); // Must be at least one-char auth and extension, eg "-A.a".
897 final int lastDot = TextUtils.lastIndexOf(exhibitName, '.', l - 1); // Must be at least one-char extension, eg ".a".
898 return(exhibitName.subSequence(lastDash+1, lastDot));
899 }
900
901 /**Extract the extension (without dot) of a valid full or short exhibit name, assuming the name is valid.
902 * If the argument is not a valid full exhibit name, the result is undefined
903 * <em>unless</em> the name is the final (file/short) component of a valid name.
904 */
905 public static CharSequence getExtensionComponent(final CharSequence exhibitName)
906 {
907 assert(((exhibitName instanceof Name.ExhibitFull) || validNameSyntax(exhibitName)) || ((exhibitName instanceof Name.ExhibitShort) || validNameFinalComponentSyntax(exhibitName)));
908 final int l = exhibitName.length();
909 final int lastDot = TextUtils.lastIndexOf(exhibitName, '.', l - 1); // Must be at least one-char extension, eg ".a".
910 return(exhibitName.subSequence(lastDot+1, l));
911 }
912 }