001    /*
002    Copyright (c) 1996-2011, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    package org.hd.d.pg2k.svrCore;
030    
031    import java.util.Collection;
032    import java.util.Comparator;
033    import java.util.SortedSet;
034    import java.util.TreeSet;
035    
036    import org.w3c.dom.Element;
037    import org.w3c.dom.NamedNodeMap;
038    import org.w3c.dom.Node;
039    
040    /**Some simple common text utilities of scope throughout the application.
041     */
042    public final class TextUtils
043        {
044        /**Prevent construction of an instance. */
045        private TextUtils() { }
046    
047        /**Suffixes used by sizeAsText. */
048        private static final String sizeSuffixes[] = { "B", "kB", "MB", "GB", "TB", "EB" };
049        /**Renders size in bytes as text, abbreviated if requested.
050         * Can produce normal or abbreviated output.
051         */
052        public static final String sizeAsText(final long size, final boolean abbrev)
053            {
054            final StringBuilder sb = new StringBuilder(abbrev ? 5 : 16);
055            if(abbrev)
056                {
057                long divisor = 1;
058                for(int i = 0; i < sizeSuffixes.length; ++i, divisor *= 1024)
059                    {
060                    if(size < divisor * 1000)
061                        {
062                        // Takes 3 or less digits with this suffix, so show it!
063                        sb.append((size + divisor/2) / divisor);
064                        sb.append(sizeSuffixes[i]);
065                        return(sb.toString()); // Done.
066                        }
067                    }
068                // We may fall through to the default case...
069                }
070            sb.append(size);
071            sb.append("Byte");
072            return(sb.toString());
073            }
074    
075        /**Returns true if given String is 7-bit clean, is is pure ASCII, or is null.
076         */
077        public static boolean isASCII7(final String s)
078            {
079            if(s == null) { return(true); }
080            for(int i = s.length(); --i >= 0; )
081                { if((s.charAt(i) & ~0x7f) != 0) { return(false); } }
082            return(true); // Seems OK.
083            }
084    
085        /**Recursively copy second Node and contents into first node.
086         * This works even when the second Node is from a different Document to the first
087         * and importNode() does not work (eg due to JDK1.5 impl bug).
088         */
089        public static final void importCopy(final Node dest, final Node src)
090            {
091            if((dest == null) || (src == null))
092                { throw new IllegalArgumentException(); }
093    
094            final Element cp = dest.getOwnerDocument().createElement(src.getNodeName());
095    
096            // Deal with attrs, etc.
097            final boolean hasAttributes = (src.getNodeType() == Node.ELEMENT_NODE) &&
098                    (src.getAttributes().getLength() > 0);
099            if(hasAttributes)
100                {
101                final NamedNodeMap attribs = src.getAttributes();
102                for(int i = 0; i < attribs.getLength(); i++)
103                    { cp.setAttribute(attribs.item(i).getNodeName(), attribs.item(i).getNodeValue()); }
104                }
105    
106            // Copy in any node value.
107            cp.setNodeValue(src.getNodeValue());
108    
109            // Now add the new node to the destination tree.
110            dest.appendChild(cp);
111    
112            // Recursively copy children.
113            for(Node child = src.getFirstChild(); child != null; child = child.getNextSibling())
114                { importCopy(cp, child); }
115            }
116    
117        /**String to write as line terminator when converting to XML.
118         * A simple LF would suit UNIX-like systems
119         * though would probably be accepted by most.
120         * A bulkier CRLF is more like the Internet "standard".
121         */
122        private static final String XML_LINE_TERM = "\r\n";
123    
124        /**Write DOM tree as XML/XHTML String; never "" nor null.
125         * Can write as terse/efficient single-line form or longer more human-friendly format.
126         * <p>
127         * When writing XHTML we make text nodes XML/HTML-safe, though allow embedded entities.
128         *
129         * @param node  DOM tree root; never null
130         * @param toXHTML  if true write with formatting suitable to include directly in HTML/XHTML for human consumption
131         *     (eg using dl/ul/ol nested lists to produce formatted text representing the structure)
132         *     rather than a pure XML representation of the Node tree itself
133         * @param terse  if true write as compactly as possible, else make more human-readable if possible
134         */
135        public static final String toXML(final Node node, final boolean toXHTML, final boolean terse)
136            {
137            final StringBuilder result = new StringBuilder(64);
138            toXML(result, node, toXHTML, terse);
139            return(result.toString());
140            }
141    
142        /**Write DOM tree as XML/XHTML String; appends to supplied StringBuilder.
143         * Can write as terse/efficient single-line form or longer more human-friendly format.
144         * <p>
145         * When writing XHTML we make text nodes XML/HTML-safe, though allow embedded entities.
146         *
147         * @param node  DOM tree root; never null
148         * @param toXHTML  if true write with formatting suitable to include directly in HTML/XHTML for human consumption
149         *     (eg using dl/ul/ol nested lists to produce formatted text representing the structure)
150         *     rather than a pure XML representation of the Node tree itself
151         * @param terse  if true write as compactly as possible, else make more human-readable if possible
152         */
153        public static final void toXML(final StringBuilder result,
154                                       final Node node, final boolean toXHTML, final boolean terse)
155            {
156            if((node == null) || (result == null))
157                { throw new IllegalArgumentException(); }
158    
159            final String nodeValue = node.getNodeValue();
160    
161            final boolean hasValue = (node.getNodeValue() != null);
162    
163            final boolean hasChildren = (node.getFirstChild() != null);
164    
165            final boolean hasAttributes = (node.getNodeType() == Node.ELEMENT_NODE) &&
166                    (node.getAttributes().getLength() > 0);
167    
168            if(toXHTML) // XHTML/HTML formatted text.
169                {
170                result.append("<dl compact><dt>");
171                result.append("<b>").append(node.getNodeName()).append("</b>");
172    
173                if(hasAttributes)
174                    {
175                    final NamedNodeMap attribs = node.getAttributes();
176                    for(int i = 0; i < attribs.getLength(); i++)
177                        {
178                        result.append(' ');
179                        result.append(attribs.item(i).getNodeName());
180                        result.append("=\"");
181                        result.append(escapeHTMLMetaChars(attribs.item(i).getNodeValue()));
182                        result.append('"');
183                        }
184                    }
185    
186                // Write any value with < > & " ' escaped...
187                if(hasValue) { result.append(" [").append(escapeHTMLMetaChars(nodeValue.trim())).append("] "); }
188    
189                // Recursively deal with any child nodes...
190                if(hasChildren)
191                    {
192                    result.append("<dd>");
193                    for(Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
194                        { toXML(result, child, toXHTML, terse); }
195                    }
196    
197                result.append("</dl>");
198                }
199            else // Pure XML.
200                {
201                result.append('<').append(node.getNodeName());
202    
203                if(hasAttributes)
204                    {
205                    final NamedNodeMap attribs = node.getAttributes();
206                    for(int i = 0; i < attribs.getLength(); i++)
207                        {
208                        result.append(' ');
209                        result.append(attribs.item(i).getNodeName());
210                        result.append("=\"");
211                        result.append(escapeHTMLMetaChars(attribs.item(i).getNodeValue()));
212                        result.append('"');
213                        }
214                    }
215    
216                // If we have no value or children
217                // then we can use the <.../> format with no separate closing tag.
218                final boolean unitag = (!hasValue && !hasChildren);
219    
220                if(unitag) { result.append('/'); }
221    
222                result.append('>');
223    
224                if(!unitag)
225                    {
226                    if(hasValue) { result.append(escapeHTMLMetaChars(nodeValue.trim())); }
227    
228                    for(Node child = node.getFirstChild(); child != null; child = child.getNextSibling())
229                        { result.append(toXML(child, toXHTML, terse)); }
230    
231                    result.append("</").append(node.getNodeName()).append('>');
232                    }
233                }
234    
235            if(!terse) { result.append(XML_LINE_TERM); } // End the line for readability...
236            }
237    
238        /**Rewrite HTML so that it displays as "raw" text and is safe to use in attribute values.
239         * Meta-characters (&lt;, &gt;, &amp; &quot; ') are rewritten
240         * to entity codes so that when shown on screen it looks like
241         * the raw HTML text when displayed in-line in HTML,
242         * and so as to avoid problems when embedded in XML and HTML.
243         * <p>
244         * If the input text does not contain these meta-characters
245         * then it is returned unchanged.
246         * <p>
247         * Additionally, all characters &lt; 32 (ASCII control characters)
248         * are converted to spaces.
249         * <p>
250         * If null is passed in, then this returns null,
251         * to simplify its use in some cases where a null might be present.
252         *
253         * @param in  String possibly containing HTML/XML meta-characters
254         */
255        public static String escapeHTMLMetaChars(final String in)
256            {
257            if(in == null) { return(null); }
258    
259            // Check for "meta" characters in input.
260            // If none found then we can return the input String as-is.
261            boolean foundMetaChar = false;
262            for(int i = in.length(); --i >= 0; )
263                {
264                final char c = in.charAt(i);
265                if((c < 32) || ("<>\"'&".indexOf(c) != -1))
266                    {
267                    foundMetaChar = true;
268                    break;
269                    }
270                }
271            if(!foundMetaChar) { return(in); }
272    
273            // Take a working writable copy of the input.
274            final StringBuilder work = new StringBuilder(in);
275    
276            // Work backwards through the string
277            // (for efficiency and simplicity)
278            // carefully escaping the meta-characters.
279            for(int i = work.length(); --i >= 0; )
280                {
281                final char c = work.charAt(i);
282                String replacement;
283                switch(c)
284                    {
285                    case '<': { replacement =  "&lt;"; break; }
286                    case '>': { replacement =  "&gt;"; break; }
287                    case '&': { replacement =  "&amp;"; break; }
288                    case '"': { replacement =  "&quot;"; break; }
289                    case '\'': { replacement =  "&#039;"; break; }
290                    default:
291                        {
292                        // Replace ASCII control codes...
293                        if(c < 32) { replacement = " "; break; }
294    
295                        // No replacement needed.
296                        continue;
297                        }
298                    }
299    
300                // Delete the old character.
301                work.deleteCharAt(i);
302                // Insert the new string
303                // (we could merge these two ops for efficiency).
304                work.insert(i, replacement);
305                }
306    
307            return(work.toString());
308            }
309    
310        /**Private to encode8To6()/decode8To6(); automagically created on first access. */
311        private static final class Base64Cache
312            {
313            /**Prevent construction of any instance. */
314            private Base64Cache() {}
315    
316            /**Private to encode8To6()/decode8To6(). */
317            private static final char[] alphabet = {
318                'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',  //  0 to  7
319                'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',  //  8 to 15
320                'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',  // 16 to 23
321                'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',  // 24 to 31
322                'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',  // 32 to 39
323                'o', 'p', 'q', 'r', 's', 't', 'u', 'v',  // 40 to 47
324                'w', 'x', 'y', 'z', '0', '1', '2', '3',  // 48 to 55
325                '4', '5', '6', '7', '8', '9', '+', '/'}; // 56 to 63
326    
327            /**Index/value of each legit 7-bit-ASCII character from the alphabet.
328             * All other (ie illegal) values map to -1.
329             * <p>
330             * This is effectively the reverse mapping for alphabet[].
331             */
332            private static final int valueOf[] = new int[128];
333    
334            /**Initialise valueOf[]. */
335            static
336                {
337                for(int i = valueOf.length; --i >= 0; ) { valueOf[i] = -1; }
338                for(int i = alphabet.length; --i >= 0; ) { valueOf[alphabet[i]] = i; }
339                }
340    
341            /**Gets valueOf given char; non-negative.
342             * @throws IllegalArgumentException for invalid char (not from allowed alphabet)
343             * @return value in range 0--63 inclusive
344             */
345            public static int getValueOf(final char c)
346                {
347                if(c > 128) { throw new IllegalArgumentException(); }
348                final int v = valueOf[c];
349                if(v == -1) { throw new IllegalArgumentException(); }
350                return(v);
351                }
352            }
353    
354        /**Decode to a byte array (8 bit) from ASCII Base-64 (6 bit); never null.
355         * The input data must always be a multiple of 4 characters,
356         * and all character must come from the standard Base64 set [A-Za-z0-9+/=].
357         *
358         * @param base64Text  data in base-46, eg as encoded by encode8To6(); never null
359         * @return binary data; never null
360         */
361        public static byte[] decode8To6(final String base64Text)
362            {
363            if(base64Text == null) { throw new IllegalArgumentException(); }
364            final int inputLen = base64Text.length();
365            // Handle empty input specially.
366            if(inputLen == 0) { return(new byte[0]); }
367            // Check input is legitimate length (eg appropriately padded with trailing '='s).
368            if((inputLen & 3) != 0)
369                { throw new IllegalArgumentException("input length must be multiple of 4: was "+base64Text.length()); }
370    
371            // Compute the maximum result length (absent trailing '='s).
372            final int maxResultSize = 3 * (inputLen / 4);
373            // Compute the actual result size.
374            final boolean oneTrailingEquals = (base64Text.charAt(inputLen-1) == '=');
375            final boolean twoTrailingEquals = oneTrailingEquals && (base64Text.charAt(inputLen-2) == '=');
376            final int resultSize = twoTrailingEquals ? (maxResultSize-2) :
377                                       (oneTrailingEquals ? (maxResultSize-1) :
378                                           maxResultSize);
379            // Make the array for the final result...
380            final byte result[] = new byte[resultSize];
381    
382            // Decoding TO block of 3 bytes at index i in the result
383            // FROM block of 4 characters at index j in the source.
384            for(int i = 0, j = 0; i < resultSize; i += 3, j += 4)
385                {
386                // Special treatment for the last block if it is partial.
387                final boolean partialLastBlock = ((i+3) > resultSize);
388    
389                // Always the first two chars of input block must be real encoded data.
390                final int v0 = Base64Cache.getValueOf(base64Text.charAt(j+0));
391                final int v1 = Base64Cache.getValueOf(base64Text.charAt(j+1));
392                // Always at least one output byte.
393                result[i+0] = (byte)((v0 << 2)         + (v1 >>> 4)); // b0 = v0v0v0v0 v0v0v1v1
394    
395                if(!partialLastBlock)
396                    {
397                    // Full block; two more bytes to decode from three more input chars.
398                    final int v2 = Base64Cache.getValueOf(base64Text.charAt(j+2));
399                    final int v3 = Base64Cache.getValueOf(base64Text.charAt(j+3));
400                    result[i+1] = (byte)(((v1 & 0xf) << 4) + (v2 >>> 2)); // b1 = v1v1v1v1 v2v2v2v2
401                    result[i+2] = (byte)(((v2 & 3) << 6)   +  v3       ); // b2 = v2v2v3v3 v3v3v3v3
402                    }
403                else if(!twoTrailingEquals)
404                    {
405                    // Partial last block with more than one byte to decode from one more input char.
406                    final int v2 = Base64Cache.getValueOf(base64Text.charAt(j+2));
407                    result[i+1] = (byte)(((v1 & 0xf) << 4) + (v2 >>> 2)); // b1 = v1v1v1v1 v2v2v2v2
408                    }
409                }
410    
411            return(result);
412            }
413    
414        /**Encode a byte array (8 bit) in ASCII Base-64 (6 bit); never null.
415         * With thanks to Apache.
416         * @param data8Bit  binary input data; never null
417         * @return ASCII base-64 representation as String; never null
418         */
419        public static String encode8To6(final byte[] data8Bit)
420            {
421            final StringBuilder sb = new StringBuilder(data8Bit.length * 2);
422    
423            int i = 0; // Index in data8Bit[].
424            while((i + 3) <= data8Bit.length)
425                {
426                int bits24 = (data8Bit[i++] & 0xFF) << 16;
427                bits24 |= (data8Bit[i++] & 0xFF) << 8;
428                bits24 |= (data8Bit[i++] & 0xFF);
429                sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
430                sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
431                sb.append(Base64Cache.alphabet[(bits24 & 0x00000FC0) >> 6]);
432                sb.append(Base64Cache.alphabet[(bits24 & 0x0000003F)]);
433                }
434    
435            if(data8Bit.length - i == 2)
436                {
437                int bits24 = (data8Bit[i] & 0xFF) << 16;
438                bits24 |= (data8Bit[i + 1] & 0xFF) << 8;
439                sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
440                sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
441                sb.append(Base64Cache.alphabet[(bits24 & 0x00000FC0) >> 6]);
442                sb.append('=');
443                }
444            else if(data8Bit.length - i == 1)
445                {
446                final int bits24 = (data8Bit[i] & 0xFF) << 16;
447                sb.append(Base64Cache.alphabet[(bits24 & 0x00FC0000) >> 18]);
448                sb.append(Base64Cache.alphabet[(bits24 & 0x0003F000) >> 12]);
449                sb.append('=');
450                sb.append('=');
451                }
452    
453            return(sb.toString());
454            }
455    
456        /**Sanitise text for XHTML/HTML/WML/XML use.
457         * The transformations are:
458         * <ul>
459         * <li>A null argument is converted to an empty string.
460         * <li>Whitespace is trimmed from the ends.
461         * <li>Converts any ``&lt;'' to ``{,'' ``&gt;'' to ``}''.
462         * <li>Unless in "allowEntities" modes, converts any ``&amp;'' to ``+'',
463         *     else leaves syntactically-correct entities intact and counts
464         *     as a single character.
465         * <li>Any non-ASCII characters are converted to a safe value;
466         *     if entity codes are permitted then these values are converted
467         *     to entity codes.
468         * <li>The result is limited to at most the specified number of characters.
469         * </ul>
470         * <p>
471         * Our simple definition of a syntactically-correct entity is that
472         * it may contain any selection of ASCII digits and letters and optionally
473         * a leading hash (`#'), but certainly no whitespace.  We may enforce a
474         * length limit.
475         * <p>
476         * If the input string is OK it is returned untouched.
477         * <p>
478         * TODO: Needs version with extra argument to allow entity sequences
479         *     (of the form '&amp;'xxx;) to be treated as single characters
480         *     and to delete any '&lt;'...'&gt;' sequences,
481         *     ie to do a more sophisticated job of sanitising XML/HTML
482         *     that has some simple markup (primarily entity codes needed for i18n).
483         * TODO: jUnit tests
484         *
485         * @param maxLen  if positive the output is limited to at most
486         *     this number of characters;
487         *     if &gt;= 3 truncated text is marked with a trailing ``...''
488         *     in the last three positions
489         * @param allowEntities  if true, treats HTML/XML entity sequences
490         *     as if single characters; the entities are vaguely tested for
491         *     correct syntax and may not be allowed if invalid
492         */
493        public static String sanitiseForXML(final String input,
494                                            final int maxLen,
495                                            final boolean allowEntities)
496            {
497            if(maxLen < 0) { throw new IllegalArgumentException(); }
498    
499            if(input == null) { return(""); }
500    
501            // First trim whitespace from either end.
502            String in = input.trim();
503    
504            // Always remove potentially-unsafe tag meta-characters.
505            in = in.replace('<', '{').replace('>', '}');
506    
507            // If non-null, the input string has had all entities replaced with
508            // just the leading '&' with the entity code
509            // (not including the trailing ';')
510            // at the matching index in this array,
511            // so all entities will appear as a single character.
512            String entities[] = null;
513    
514            // If not allowing entities zap all '&' chars...
515            if(!allowEntities)
516                { in = in.replace('&', '+'); }
517            // If we are allowing entities, and there are any,
518            // convert them all to placeholders.
519            else if(in.indexOf('&') >= 0)
520                {
521                // in may get truncated but not longer while this array is in use.
522                entities = new String[in.length()];
523    
524                // Break out the (valid) entities one at a time...
525                for(int nextAmp = -1; (nextAmp = in.indexOf('&', nextAmp + 1)) != -1; )
526                    {
527                    // Look for the next semicolon...
528                    final int nextSc = in.indexOf(';', nextAmp + 1);
529                    if(nextSc == -1)
530                        {
531                        // Whoops; if no more semicolons, then
532                        // all ampersands from current are bogus and we can stop.
533                        in = ((nextAmp >= 0) ? in.substring(0, nextAmp) : "") +
534                            (in.substring(nextAmp).replace('&', '+'));
535                        break;
536                        }
537    
538    
539                    // TODO: do more validity checking of a putative entity:
540                    // check that it letters and digits or a hash and all digits.
541    
542    
543                    // Now assume that the entity is valid,
544                    // so break it out into the entities array
545                    // leaving the '&' behind
546                    // and putting the body (not the trailing ';')
547                    // in the entities[] array at the '&' index.
548                    entities[nextAmp] = in.substring(nextAmp+1, nextSc);
549                    in = in.substring(0, nextAmp+1) +
550                         in.substring(nextSc+1);
551                    }
552                }
553    
554            // Truncate if necessary while
555            // any entities that we care about are "broken out"...
556            if((maxLen >= 0) && (in.length() > maxLen))
557                {
558                if(maxLen >= 3) { in = in.substring(0, maxLen-3) + "..."; }
559                else { in = in.substring(0, maxLen); }
560                }
561    
562            // If we broke out any entities,
563            // put any remaining ones back in-line.
564            if(entities != null)
565                {
566                final StringBuilder sb = new StringBuilder(in.length()); // At least this long.
567    
568                final int len = in.length();
569                for(int i = 0; i < len; ++i)
570                    {
571                    final char c = in.charAt(i);
572                    sb.append(c);
573                    if(c != '&')
574                        { continue; } // Usual case; no entity involved.
575    
576                    // Expand an entity.
577                    assert(entities[i] != null);
578                    sb.append(entities[i]).append(';');
579                    }
580    
581                // Convert back to expanded string.
582                in = sb.toString();
583    
584    //            entities = null; // Allow GC.
585                }
586    
587            // Look for any char outwith range 32--126.
588            // If one is found then replace every occurrence with ' ' and retrim at end,
589            // unless allowing entities in which case replace with an entity code.
590            for(int i = in.length(); --i >= 0; )
591                {
592                final char c = in.charAt(i);
593                if((c < 32) || (c > 126))
594                    {
595                    // Whole string needs fixing...
596                    final StringBuilder sb = new StringBuilder(2 * in.length());
597                    for(int j = 0; j < in.length(); ++j)
598                        {
599                        final char d = in.charAt(j);
600                        if((d < 32) || (d > 126))
601                            {
602                            // If allowing entities,
603                            // convert non-ASCII chars to entities.
604                            if(!allowEntities) { sb.append(' '); }
605                            else { sb.append("&#").append(0xffff & d).append(';'); }
606                            }
607                        else { sb.append(d); }
608                        }
609                    in = sb.toString();
610                    if(!allowEntities) { in = in.trim(); }
611                    break;
612                    }
613                }
614    
615            if(input.equals(in)) { return(input); } // Avoid making copies!
616            return(in);
617            }
618    
619    
620        /**Extension for a CharSequence that holds only 8-bit character values.
621         * The implementation of this may often hold the underlying data as a byte array.
622         */
623        public static interface CharSequence8Bit extends CharSequence
624            {
625            /**Return private copy (8-bit) text with each byte corresponding to one char; never null.
626             * May be far more efficient than converting via a String
627             * if the aim is to write directly to a file for example.
628             */
629            public byte[] toByteArray();
630            }
631    
632        /**Orders CharSequence objects as if by String.compareTo(); not null.
633         * If both arguments to compare() are the same reference (even null) this returns 0 immediately.
634         */
635        public static final Comparator<CharSequence> CASE_SENSITIVE_ORDER = new Comparator<CharSequence>(){
636            public int compare(final CharSequence cs1, final CharSequence cs2)
637                {
638                if(cs1 == cs2) { return(0); } // Optimisation...
639                final int l1 = cs1.length();
640                final int l2 = cs2.length();
641                final int l = Math.min(l1, l2);
642                for(int i = 0; i < l; ++i)
643                    {
644                    final char c1 = cs1.charAt(i);
645                    final char c2 = cs2.charAt(i);
646                    if(c1 != c2)
647                        { return(c1 - c2); }
648                    }
649                return(l1 - l2);
650                }
651            };
652    
653        /**Orders CharSequence objects as if by String.compareToIgnoreCase(); not null.
654         * If both arguments to compare() are the same reference (even null) this returns 0 immediately.
655         */
656        public static final Comparator<CharSequence> CASE_INSENSITIVE_ORDER = new Comparator<CharSequence>(){
657            public int compare(final CharSequence cs1, final CharSequence cs2)
658                {
659                if(cs1 == cs2) { return(0); } // Optimisation...
660                final int l1 = cs1.length();
661                final int l2 = cs2.length();
662                final int l = Math.min(l1, l2);
663                for(int i = 0; i < l; ++i)
664                    {
665                    char c1 = cs1.charAt(i);
666                    char c2 = cs2.charAt(i);
667                    if(c1 != c2)
668                        {
669                        c1 = Character.toUpperCase(c1);
670                        c2 = Character.toUpperCase(c2);
671                        if(c1 != c2)
672                            {
673                            c1 = Character.toLowerCase(c1);
674                            c2 = Character.toLowerCase(c2);
675                            if (c1 != c2)
676                                { return(c1 - c2); }
677                            }
678                        }
679                    }
680                return(l1 - l2);
681                }
682            };
683    
684        /**First index of specified character in given (non-null) CharSequence as for String.
685         * @return index of first occurrence of c, else -1 if no occurrence of c
686         */
687        public static final int indexOf(final CharSequence cs, final char c)
688            { return(TextUtils.indexOf(cs, c, 0)); }
689    
690        /**First index of specified character in given (non-null) CharSequence from/after specified index as for String.
691         * @return index of first occurrence of c, else -1 if no occurrence of c
692         */
693        public static final int indexOf(final CharSequence cs, final char c, final int fromIndex)
694            {
695            if(cs == null) { throw new IllegalArgumentException(); }
696            final int len = cs.length();
697            for(int i = Math.max(0, fromIndex); i < len; ++i)
698                { if(c == cs.charAt(i)) { return(i); } }
699            return(-1);
700            }
701    
702        /**Last index of specified character in given (non-null) CharSequence as for String.
703         * @return index of last occurrence of c, else -1 if no occurrence of c
704         */
705        public static final int lastIndexOf(final CharSequence cs, final char c)
706            { return(TextUtils.lastIndexOf(cs, c, Integer.MAX_VALUE)); }
707    
708        /**Last index of specified character in given (non-null) CharSequence from/before specified start index as for String.
709         * @return index of last occurrence of c, else -1 if no occurrence of c
710         */
711        public static final int lastIndexOf(final CharSequence cs, final char c, final int fromIndex)
712            {
713            if(cs == null) { throw new IllegalArgumentException(); }
714            final int len = cs.length();
715            for(int i = ((fromIndex >= len) ? len - 1 : fromIndex); --i >= 0; )
716                { if(c == cs.charAt(i)) { return(i); } }
717            return(-1);
718            }
719    
720        /**Checks that the specified region of two (non-null) CharSequences matches exactly (as for String).
721         */
722        public static boolean regionMatches(final CharSequence cs1, final int start1,
723                                            final CharSequence cs2, final int start2,
724                                            final int len)
725            {
726            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
727            if((start1 < 0) || (start2 < 0)) { throw new IllegalArgumentException(); }
728            if((start1 + len > cs1.length()) || (start2 + len > cs2.length())) { throw new IllegalArgumentException(); }
729            for(int i = len; --i >= 0; )
730                { if(cs1.charAt(start1 + i) != cs2.charAt(start2 + i)) { return(false); } }
731            return(true);
732            }
733    
734        /**Checks that two (non-null) CharSequences represent the same sequence of chars.
735         */
736        public static boolean contentEquals(final CharSequence cs1, final CharSequence cs2)
737            {
738            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
739            if(cs1 == cs2) { return(true); } // Optimisation.
740            final int len = cs1.length();
741            if(len != cs2.length()) { return(false); }
742            for(int i = len; --i >= 0; )
743                { if(cs1.charAt(i) != cs2.charAt(i)) { return(false); } }
744            return(true);
745            }
746    
747        /**Checks that two (non-null) CharSequences represent the same sequence of chars, ignoring case. */
748        public static boolean contentEqualsIgnoreCase(final CharSequence cs1, final CharSequence cs2)
749            {
750            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
751            if(cs1 == cs2) { return(true); } // Optimisation.
752            final int len = cs1.length();
753            if(len != cs2.length()) { return(false); }
754            // Use our case-insensitive comparator for consistency.
755            return(0 == CASE_INSENSITIVE_ORDER.compare(cs1, cs2));
756            }
757    
758        /**Compares two (non-null) CharSequences for lexical order. */
759        public static int compare(final CharSequence cs1, final CharSequence cs2)
760            {
761            if((null == cs1) || (null == cs2)) { throw new IllegalArgumentException(); }
762            final int l1 = cs1.length();
763            final int l2 = cs2.length();
764            final int l = Math.min(l1, l2);
765            for(int i = 0; i < l; ++i)
766                {
767                final char c1 = cs1.charAt(i);
768                final char c2 = cs2.charAt(i);
769                if(c1 != c2) { return(c1 - c2); }
770                }
771            return(l1 - l2);
772            }
773    
774        /**Returns true if the first sequence starts with the second, else false. */
775        public static boolean startsWith(final CharSequence mainText,
776                                         final CharSequence putativePrefix)
777            {
778            final int ppl = putativePrefix.length();
779            return((mainText.length() >= ppl) &&
780                regionMatches(mainText, 0, putativePrefix, 0, ppl));
781            }
782    
783        /**Returns true if the first sequence ends with the second, else false. */
784        public static boolean endsWith(final CharSequence mainText,
785                                       final CharSequence putativeSuffix)
786            {
787            final int psl = putativeSuffix.length();
788            final int ml = mainText.length();
789            return((ml >= psl) &&
790                regionMatches(mainText, ml-psl, putativeSuffix, 0, psl));
791            }
792    
793        /**Create and populate a SortedSet of CharSequence in natural/total case-sensitive order that will work with any mix of immutable CharSequence keys; never null.
794         * This may not be consistent with equals() if keys are of mixed types.
795         * <p>
796         * The result is not thread-safe and is mutable.
797         *
798         * @param csc  initial collection to populate from; can be null to avoid initial population
799         */
800        public static <T extends CharSequence> SortedSet<T> createCharSequenceSortedSet(final Collection<T> csc)
801            { return(createCharSequenceSortedSet(csc, TextUtils.CASE_SENSITIVE_ORDER)); }
802    
803        /**Create and populate a SortedSet of CharSequence in specified order that will work with any mix of immutable CharSequence keys; never null.
804         * This may not be consistent with equals() if keys are of mixed types.
805         * <p>
806         * The result is not thread-safe and is mutable.
807         *
808         * @param csc  initial collection to populate from; can be null to avoid initial population
809         * @param comparator   used to order the items; never null
810         */
811        public static <T extends CharSequence> SortedSet<T> createCharSequenceSortedSet(final Collection<T> csc, final Comparator<CharSequence> comparator)
812            {
813            if(null == comparator) { throw new IllegalArgumentException(); }
814            final SortedSet<T> result = new TreeSet<T>(comparator);
815            if(null != csc) { result.addAll(csc); }
816            return(result);
817            }
818        }