001    /*
002    Copyright (c) 1996-2011, Damon Hart-Davis
003    All rights reserved.
004    
005    Redistribution and use in source and binary forms, with or without
006    modification, are permitted provided that the following conditions are
007    met:
008    
009      * Redistributions of source code must retain the above copyright
010        notice, this list of conditions and the following disclaimer.
011    
012      * Redistributions in binary form must reproduce the above copyright
013        notice, this list of conditions and the following disclaimer in the
014        documentation and/or other materials provided with the
015        distribution.
016    
017    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028    */
029    package org.hd.d.pg2k.svrCore;
030    
031    import java.io.IOException;
032    import java.io.InvalidObjectException;
033    import java.io.ObjectInputStream;
034    import java.io.ObjectInputValidation;
035    import java.io.ObjectOutputStream;
036    import java.io.ObjectStreamException;
037    import java.io.PrintStream;
038    import java.io.Serializable;
039    import java.util.AbstractList;
040    import java.util.ArrayList;
041    import java.util.Arrays;
042    import java.util.Collections;
043    import java.util.Comparator;
044    import java.util.HashMap;
045    import java.util.HashSet;
046    import java.util.List;
047    import java.util.RandomAccess;
048    import java.util.SortedSet;
049    import java.util.TreeSet;
050    import java.util.concurrent.ConcurrentHashMap;
051    import java.util.concurrent.ConcurrentMap;
052    import java.util.concurrent.atomic.AtomicInteger;
053    
054    import org.hd.d.pg2k.svrCore.Tuple.Pair;
055    import org.hd.d.pg2k.svrCore.Tuple.Triple;
056    
057    import ORG.hd.d.IsDebug;
058    
059    /**Compact immutable in-memory and on-the-wire representation of a (short, potentially compressible) 7-bit ASCII String.
060     * Converting to and from this form should be relatively resource-light,
061     * eg with low CPU and memory requirements.
062     * <p>
063     * Where there is a conflict between memory consumption and CPU,
064     * this implementation aims to minimise memory use at the possible expense of CPU.
065     * Thus this class is probably inefficient to use as a key in a hashed data structure
066     * where the hash is used heavily,
067     * since we do not cache the hash even though it may be expensive to compute.
068     * <p>
069     * This class is instance-controlled,
070     * and uses intern() to avoid duplicate in-memory copies.
071     * <p>
072     * This cannot hold text longer than Short.MAX_VALUE characters long.
073     * <p>
074     * Internally this uses a byte[] to store the data
075     * (a basic halving of in-memory size in most JVMs)
076     * with some tokenising for further compaction of plain-text for example.
077     * No attempt is made to share state with other instances in memory
078     * (unless a shared StaticDictionary is used).
079     * <p>
080     * An instance of Compact7BitString may serialise as an (immutable) CharSequence rather than as itself
081     * if it seems likely to be more efficient/compressible/safe to do so
082     * (a) for global compressibility (b) for memory efficiency during (de)serialisation
083     * or (c) if the value cannot be encoded other than as a String instance.
084     * <p>
085     * So a coder had better make any serialisable field with a Compact7BitString value
086     * be of type Object or Serializable
087     * to allow it to be read back as either this or a CharSequence.
088     * <p>
089     * This is not publicly constructible,
090     * and may have specialised sub-classes to support some features,
091     * while keeping the base class as small as possible.
092     * <p>
093     * This base class supports all CharSequence methods,
094     * but those that imply random access may be very slow
095     * and take O(n) time where n is the length of the String.
096     */
097    public class Compact7BitString implements Serializable,
098                                                    ObjectInputValidation,
099                                                    MemoryTools.Internable
100        {
101        /**If true, log our encode/decode steps to help debug problems. */
102        private static final boolean VERBOSE_DEBUG = false;
103    
104        /**If true, then log some global stats for tuning.
105         * Should generally be false so as to avoid wasting CPU time and memory.
106         */
107        private static final boolean LOG_STATS = false;
108    
109        /**Immutable static dictionary to improve in-memory compression.
110         * It is not permitted to serialise as-is a Compact7BitString that is using one of these
111         * so as to prevent decompression errors/ambiguities.
112         */
113        public static final class StaticDictionary
114            {
115            /**Create an instance of a static dictionary.
116             *
117             * @param name  dictionary name; must not be null
118             * @param tokens  unique tokens, non-null, best first, each of length &gt; 1,
119             *     non-empty, no longer than MAX_TOKENS
120             */
121            public StaticDictionary(final String name,
122                                    final List<String> tokens)
123                {
124                if(name == null) { throw new IllegalArgumentException(); }
125                this.name = name;
126                if(tokens == null) { throw new IllegalArgumentException(); }
127                // Take a defensive copy.
128                this.tokens = Collections.unmodifiableList(new ArrayList<String>(tokens));
129                // Validate the token set.
130                if(this.tokens.isEmpty() || (this.tokens.size() > MAX_TOKENS))
131                    { throw new IllegalArgumentException("too many/few tokens"); }
132                if(this.tokens.size() != (new HashSet<String>(this.tokens)).size())
133                    { throw new IllegalArgumentException("duplicate token"); }
134                for(final String s : this.tokens)
135                    {
136                    if((s == null) || (s.length() < 2))
137                        { throw new IllegalArgumentException("invalid token (null or too short): " + s); }
138                    if(tokenise(s).size() != 1)
139                        { throw new IllegalArgumentException("invalid token (mixed word and non-word chars): " + s); }
140                    }
141    
142                // Make lookup table from token to virtual "-ve" offsets.
143                // Note that "best" tokens have the smallest offset
144                // so as to be reachable from as far into the input stream as possible.
145                final HashMap<String,Integer> virtualOffsets = new HashMap<String,Integer>(2 * this.tokens.size());
146                // We always start at offset -2 in case use of offset -1 is not possible.
147                int offset = FIRST_VIRTUAL_OFFSET;
148                for(final String t : this.tokens)
149                    {
150                    assert(offset >= Byte.MIN_VALUE);
151                    final Integer oI = Integer.valueOf(offset--);
152                    virtualOffsets.put(t, oI);
153                    }
154                assert(!virtualOffsets.isEmpty());
155                this.virtualOffsets = virtualOffsets;
156                }
157    
158            /**The maximum number of simple tokens for the static dictionary; strictly positive. */
159            public static final int MAX_TOKENS = -Byte.MIN_VALUE - 2;
160    
161            /**First (numerically most positive) "virtual" offset used by the static dictionary; strictly negative.
162             * We start at -2 to avoid conflict with -1 used to escape/start an MTM sequence.
163             */
164            private static final int FIRST_VIRTUAL_OFFSET = -2;
165    
166            /**The name of this dictionary, never null. */
167            public final String name;
168    
169            /**The immutable (non-empty) list of unique, best-first tokens; never null nor empty. */
170            public final List<String> tokens;
171    
172            /**Virtual offsets of tokens in static dictionary; never null nor empty.
173             * Logically immutable.
174             * <p>
175             * Kept private because it is faster to keep it unwrapped and mutable.
176             */
177            private final HashMap<String,Integer> virtualOffsets;
178    
179            /**Human-readable summary: including the dictionary name; never null. */
180            @Override public String toString() { return(name); }
181    
182            /**Only equal if all non-stats fields are. */
183            @Override public boolean equals(final Object obj)
184                {
185                if(this == obj) { return(true); }
186                if(!(obj instanceof StaticDictionary)) { return(false); }
187                final StaticDictionary other = (StaticDictionary) obj;
188                if(!name.equals(other.name)) { return(false); }
189                if(!tokens.equals(other.tokens)) { return(false); }
190                if(!virtualOffsets.equals(other.virtualOffsets)) { return(false); }
191                return(true); // Identical.
192                }
193    
194            /**We don't expect to have to distinguish other than on name in practice, but we use the table size too. */
195            @Override public int hashCode() { return(name.hashCode() + tokens.size()); }
196    
197            /**Thread-safe map from text of uncompressed token (close to start of input) to count and summed-position of the token; non-null iff LOG_STATS is true.
198             * Updated when attempting to compress String data.
199             * <p>
200             * This map is for early tokens of length 2 or more characters,
201             * ie that we might be able to compress better using a static dictionary.
202             * <p>
203             * This is kept for statistical purposes.
204             * <p>
205             * Only the first occurrence of each token in each compress/input is counted.
206             */
207            private final ConcurrentMap<String,Tuple.Pair<AtomicInteger,AtomicInteger>> _uncompEarlyTokens = (!LOG_STATS) ? null :
208                new ConcurrentHashMap<String,Tuple.Pair<AtomicInteger,AtomicInteger>>(1<<16);
209    
210            /**Returns true if stats-collecting is enabled. */
211            public boolean isStatsEnabled() { return(LOG_STATS); }
212    
213            /**Resets the stats (if any) being recorded for this dictionary. */
214            public void resetStats() { if(LOG_STATS) { _uncompEarlyTokens.clear(); } }
215    
216            /**Dump stats (if any) to the given (non-null) stream.
217             * This dumps the stats of compressions against this dictionary instance.
218             *
219             * @param out  never null
220             */
221            public void dumpStats(final PrintStream out)
222                {
223                if(!LOG_STATS) { return; /* No stats to show. */ }
224    
225                // Generate a sorted set of early uncompressable tokens,
226                // sorted by the product of count and token length.
227                // In order these would then be good candidates
228                // for a static compression dictionary for this class.
229                // The set is of <token,count,rounded-up-mean-first-position>.
230                final SortedSet<Tuple.Triple<String,Integer,Integer>> tokens = new TreeSet<Tuple.Triple<String,Integer,Integer>>(new Comparator<Tuple.Triple<String,Integer,Integer>>(){
231                    public int compare(final Triple<String,Integer,Integer> o1, final Triple<String,Integer,Integer> o2)
232                        {
233                        // Sort with higher count/length product first.
234                        // (We use length-1 to reflect number of bytes saved after backref.)
235                        final long l1 = o1.second.longValue() * (o1.first.length() - 1);
236                        final long l2 = o2.second.longValue() * (o2.first.length() - 1);
237                        if(l1 != l2)
238                            {
239                            if(l1 > l2) { return(-1); /* Correct order. */ }
240                            return(1); /* Wrong order. */
241                            }
242    
243                        // We'll break ties on the token value itself
244                        // only so as to guarantee a stable total ordering.
245                        return(o1.first.compareTo(o2.first));
246                        }
247                    });
248    
249                // We rely on the thread-safety
250                // (and immunity from ConcurrentModificationException)
251                // of our iterator over _uncompEarlyTokens.
252                for(final String token : _uncompEarlyTokens.keySet())
253                    {
254                    final Pair<AtomicInteger, AtomicInteger> pair = _uncompEarlyTokens.get(token);
255                    final int count = pair.first.get();
256                    assert(count > 0);
257                    final int summedFirstPos = pair.second.get();
258                    final int rUMFP = (summedFirstPos + count-1) / count;
259                    tokens.add(new Tuple.Triple<String,Integer,Integer>(token,
260                                    Integer.valueOf(count),
261                                    Integer.valueOf(rUMFP)));
262                    }
263    
264                out.println();
265                out.println("DICTIONARY NAME: "+name);
266    
267                // Dump the tokens in order, best first.
268                // We limit this to some initial plausible set of candidates.
269                int shownSoFar = 0;
270                for(final Tuple.Triple<String,Integer,Integer> t : tokens)
271                    {
272                    // Skip rare entries, noting that it takes > 2 to save memory overall.
273                    if(t.second.intValue() < 3) { continue; }
274    
275                    // Skip entries that would be out of reach of their target tokens.
276                    if(t.third.intValue() + shownSoFar > StaticDictionary.FIRST_VIRTUAL_OFFSET + -Byte.MIN_VALUE) { continue; }
277    
278                    // Compute an "escaped" version of the token
279                    // that can go in a String literal/constant.
280                    String s = t.first;
281                    s = s.replace("\\", "\\\\"); // Escape all \ as \\.
282                    s = s.replace("\"", "\\\""); // Replace " with \".
283                    s = s.replace("\r", "\\r");  // Replace CR with \r.
284                    s = s.replace("\n", "\\n");  // Replace LF with \n.
285                    s = s.replace("\t", "\\t");  // Replace TAB with \t.
286                    // Replace all other control characters with octal escapes.
287                    // Note that we cope gracefully with the value expanding while we work.
288                    for(int i = 0; i < s.length(); ++i)
289                        {
290                        final char c = s.charAt(i);
291                        if((c < 32) || (c > 126))
292                            { s.replace(""+c, "\\" + Integer.toOctalString(c)); /* Replace all occurrences of this char at once. */ }
293                        }
294    
295                    out.println("\"" + s + "\",    /* count="+t.second+", saving="+(t.second.intValue()*(t.first.length()-1))+", meanFirstPos="+t.third+" */");
296                    if(++shownSoFar == StaticDictionary.MAX_TOKENS) { break; }
297                    }
298                }
299            }
300    
301        /**Returns true iff this holds an empty string ("").
302         * Can avoid decoding an entire String value just to discover that it is not empty.
303         */
304        public boolean isEmpty() { return(text == null); }
305    
306        /**Convert to full String form. */
307        @Override
308        public String toString()
309            {
310            // Simple empty case.
311            if(text == null) { return(""); }
312    
313            // We accumulate the latest literal bytes of input here
314            // until we can unambiguously extract one or more full tokens from it.
315            // A surprising number of tokens are > 16 chars (the default SB size)
316            // and accumulating untokenised input may thus often force a (slow) resize.
317            final StringBuilder latestInput = new StringBuilder(32);
318    
319            final int len = text.length;
320            final StringBuilder result = new StringBuilder(32 + (len<<3)); // Assume usually <8x compression.
321            final ArrayList<String> tokens = new ArrayList<String>(16 + (len>>1)); // Assume ~2x fewer output tokens than compressed bytes.
322            for(int i = 0; i < len; ++i)
323                {
324                final int ic = text[i];
325    
326                // A positive (7-bit) value is a literal/plain character.
327                // It may or may not be part of a multi-byte token.
328                if(ic >= 0)
329                    {
330    if(VERBOSE_DEBUG) { System.out.println("LITERAL "+ic+((ic >= 32) ? (" '"+((char)ic)+"'") : "")); }
331                    final char c = (char) ic;
332                    result.append(c);
333                    latestInput.append(c);
334                    continue;
335                    }
336    
337                // Tokenise any accumulated literal characters...
338                if(latestInput.length() > 0)
339                    {
340                    final List<String> newTokens = tokenise(latestInput);
341                    // Handle efficiently the common case of a single new token.
342                    if(newTokens.size() == 1) { tokens.add(newTokens.get(0)); }
343                    else { tokens.addAll(newTokens); }
344                    // ...and clear the accumulated input now that it has been tokenised.
345                    latestInput.setLength(0);
346                    }
347    
348                // We never normally expect a direct reference to the previous token,
349                // so this should indicate a multi-token match code.
350                if(ALLOW_MULTI_TOKEN_MATCHING && (ic == -1))
351                    {
352                    if(i == len-1)
353                        { throw new IllegalStateException("illegal multi-token escape at end of data"); }
354    
355                    // We are decoding multiple tokens at once.
356                    // Hold on to your hats...
357                    final int ic2 = text[++i] | ~0xff; // Extend sign from logical 9th bit upwards...
358    
359                    // Extract the encoded quantities...
360                    final int count = (ic2 & ~(~0 << TOKEN_SEQ_LEN_BITS)) + TOKEN_SEQ_LEN_MIN;
361                    final int offset = (ic2 >> TOKEN_SEQ_LEN_BITS) - 1;
362    if(VERBOSE_DEBUG) { System.out.println("DECODED SEQ COUNT, OFFSET, OPC: " + count + ", " + offset); }
363    
364                    // Expand the token sequence...
365                    int index = tokens.size() + offset;
366                    for(int j = count; --j >= 0; )
367                        {
368                        final String token = tokens.get(index++);
369                        result.append(token);
370                        // And note that this is an input token too.
371                        tokens.add(token);
372                        }
373                    continue;
374                    }
375    
376                // Now replace the back-reference with the single token that it indicates...
377                final int index = tokens.size() + ic;
378                final String token;
379                if(index < 0)
380                    {
381                    if((getDict() != null) && (index <= StaticDictionary.FIRST_VIRTUAL_OFFSET))
382                        {
383                        // This may be a reference to a token from the static dictionary.
384                        final int vIndex = StaticDictionary.FIRST_VIRTUAL_OFFSET - index;
385                        if(vIndex >= getDict().tokens.size())
386                            { throw new IllegalStateException("illegal back reference: ic="+ic+", dict.tokens.size()="+getDict().tokens.size()); }
387                        token = getDict().tokens.get(vIndex);
388                        }
389                    else
390                        { throw new IllegalStateException("illegal back reference: ic="+ic+", tokens.size()="+tokens.size()); }
391                    }
392                else
393                    { token = tokens.get(index); }
394    if(VERBOSE_DEBUG) { System.out.println("BACK REF OFFSET "+ic+" = " + token); }
395                result.append(token);
396                // And note that this is an input token too.
397                tokens.add(token);
398                }
399    
400    if(VERBOSE_DEBUG) { System.out.println("--- DECODE DONE"); }
401    
402            return(result.toString());
403            }
404    
405        /**Basic (unshared) representation of 1 byte per character; never empty (is null instead).
406         * Negative values are backward references by the specified number of tokens,
407         * eg -2 means "insert a copy of the token two places back".
408         * <p>
409         * Written out in a customised low-overhead form.
410         */
411        private transient /* final */ byte[] text;
412    //
413    //    /**Static dictionary (if any) used to aid compression; null if none.
414    //     * It is forbidden to serialise an instance where this is non-null
415    //     * so as to prevent decompression errors/ambiguities.
416    //     */
417    //    private transient final StaticDictionary dict;
418    //
419    //  /**Get the static dictionary, null if none. */
420    //  protected StaticDictionary getDict() { return(dict); }
421    
422        /**Get the static dictionary, null if none.
423         * Always null in the base class since we do not carry a dictionary to save space.
424         */
425        protected StaticDictionary getDict() { return(null); }
426    
427        /**Copy of the raw bytes in the internal representation; null if none.
428         * The internal representation is subject to change,
429         * but, with care, this can be used for extra-compact temporary storage.
430         */
431        public byte[] getInternalBytes() { return((text == null) ? null : text.clone()); }
432    
433        /**Make a Compact7BitString from the raw getInternalBytes() value; never null.
434         * This is not guaranteed to work if getInternalBytes() is called in one release
435         * and serialised in another, though this will not be broken casually.
436         * <p>
437         * The result is EMPTY or is intern()ed so as to avoid duplicates.
438         * @param dict  the static dictionary; can be null
439         */
440        public static Compact7BitString makeFromInternalBytes(final byte raw[], final StaticDictionary dict)
441            {
442            return((raw == null) ? EMPTY :
443                ((dict == null) ? MemoryTools.intern(new Compact7BitString(raw)) :
444                                  MemoryTools.intern(new WithDict(raw, dict))));
445            }
446    
447        /**Make simple un-tokenised byte[] representation.
448         * The byte array must be pure 7-bit ASCII and
449         * it must be guaranteed unshared so that we need not copy it for safety.
450         * <p>
451         * The empty string should be represented with a null array to save space,
452         * not an empty array.
453         */
454        private Compact7BitString(final byte text[] /* , final StaticDictionary dict */ )
455            {
456            this.text = text;
457            //this.dict = dict;
458    //if(IsDebug.isDebug) { System.out.println("C7BS length = " + ((text == null) ? "-" : text.length)); }
459    
460            // Verify object state.
461            try { validateObject(); }
462            catch(final InvalidObjectException e)
463                { throw new IllegalArgumentException(e.getMessage(), e); }
464            }
465    
466        /**Equal iff the static dictionaries (if any) and compressed texts are identical.
467         * In principle identical char/byte sequences could be compressed in different ways,
468         * but we only treat as equal items whose internal representations are also identical
469         * which makes equality and hashcode computation relatively fast.
470         * <p>
471         * This is designed to work for specialised sub-classes too.
472         */
473        @Override public boolean equals(final Object obj)
474            {
475            if(this == obj) { return(true); }
476            if(null == obj) { return(false); }
477            // Correctly handle the base and derived/specialised variant classes.
478            if(obj.getClass() != this.getClass()) { return(false); }
479            final Compact7BitString other = (Compact7BitString) obj;
480            // Instances with different dictionaries are considered unequal.
481            final StaticDictionary d = getDict();
482            final StaticDictionary otherD = other.getDict();
483            if(d == null) { if(otherD != null) { return(false); } }
484            else if(!d.equals(otherD)) { return(false); }
485            // Equal if the compressed text is identical.
486            return(Arrays.equals(text, other.text));
487            }
488    
489        /**Maximum number of bytes to compute a full hash over; strictly positive (or -1 to always compute a full hash).
490         * This should probably be large enough to allow full hashing of the bulk of typical texts
491         * (since a good hash over all content tends to perform best over the whole lifecycle)
492         * especially where they are too short for the length itself to be significant hash-bit source.
493         * <p>
494         * Empirically measured to be fastest at 512 or greater compact()ing live AEP as at 20070402.
495         */
496        private final static int MAX_FULL_HASH_LEN = 512;
497    
498        /**Computes the hash code for the text.
499         * The hash is zero for a zero-length (empty) text string.
500         * <p>
501         * By default this hash includes all bytes of the compressed text in the hash,
502         * since that gives best overall performance in hash-based collections.
503         * <p>
504         * However, we cap the time spent generating a hash for very long texts,
505         * hoping to glean some useful full-text information from the compressed length,
506         * and some further useful 'segregation' bits from the dictionary name if any.
507         * <p>
508         * Note that we do NOT cache the computed hash code
509         * since the aim of this representation is to be as compact as possible
510         * and holding an extra word of cached hash would undermine that.
511         */
512        @Override public int hashCode()
513            {
514            // Do the unconditional full hash if not capping hash-computation time.
515            if(MAX_FULL_HASH_LEN < 0) { return(Arrays.hashCode(text)); }
516    
517            if(text == null) { return(0); }
518            final int len = text.length;
519            if(len <= MAX_FULL_HASH_LEN) { return(Arrays.hashCode(text)); }
520            int hash = len;
521            // Hash the text backwards from the end until we hit a back-reference
522            // (after hashing a decent-sized chunk of the tail)
523            // which we hope is implicitly imbued with some information from earlier text.
524            int i = len;
525            // Hash a fixed chunk of the tail.
526            final int tailStop = len - MAX_FULL_HASH_LEN/2;
527            assert(tailStop >= 0);
528            while(--i > tailStop)
529                {
530                // We use a different hashing algorithm than Arrays.hashCode()
531                // to minimise collisions with its hashes of short texts.
532                final int b = text[i];
533                hash += ((b + (hash >>> 24)) ^ (hash << 7));
534                }
535            // Hash a bit more until we find a back-reference...
536            while(--i > 0)
537                {
538                // We use a different hashing algorithm than Arrays.hashCode()
539                // to minimise collisions with its hashes of short texts.
540                final int b = text[i];
541    //            hash = 7 * hash + b;
542                hash += ((b - 1 - (hash >> 21)) ^ (hash << 5));
543                if(b < 0)
544                    {
545    //if(IsDebug.isDebug) { System.err.println("C7BS.hashCode(): stop at "+i+"/"+len); Thread.dumpStack(); }
546                    break;
547                    }
548                }
549            return(hash);
550            }
551    
552        /**Value representing an empty String. */
553        public static final Compact7BitString EMPTY = new Compact7BitString(null);
554    
555        /**Deserialise. */
556        private void readObject(final ObjectInputStream in)
557            throws IOException, ClassNotFoundException
558            {
559            final byte l0 = in.readByte();
560            final int length;
561            if(l0 >= 0) { length = l0; }
562            else
563                {
564                final byte l1 = in.readByte();
565                length = -((l0 << 8) + (l1 & 0xff));
566                }
567            // Create and read the text field.
568            if(length == 0) { text = null; }
569            else
570                {
571                text = new byte[length];
572                in.readFully(text);
573                }
574    
575            // IMPORTANT:
576            // We assume that the constructor will eventually be called
577            // to do validation and any other setup required.
578            assert(getDict() == null); // We must never deserialise with a dictionary.
579            }
580    
581    
582        /**Write out a minimally-redundant form of our internal information.
583         * The more-efficient on-the-wire format also makes defensive
584         * reading easier.
585         * <p>
586         * We don't write *any* default state.
587         * <p>
588         * We must never be asked to serialise if we have a (non-null) dictionary.
589         * <p>
590         * We must never be asked to serialise if the text is too long for us to encode.
591         */
592        private void writeObject(final ObjectOutputStream oos)
593            throws IOException
594            {
595            assert (getDict() == null) : "we must not try to serialise if we are using a dictionary";
596            assert ((text == null) || (text.length <= Short.MAX_VALUE)) : "we must not try to serialise a text too long for us to encode";
597    
598            // Write the length in a variable-length format.
599            //
600            // A null text is a zero-length text.
601            if(null == text) { oos.writeByte(0); }
602            // For very small strings use a byte-value length.
603            else if(text.length <= Byte.MAX_VALUE) { oos.writeByte(text.length); }
604            // Write larger text sizes using a negative length
605            // so that reading the first byte will give a negative value.
606            else { oos.writeShort(-text.length); }
607    
608            // Now write the text bytes directly, unwrapped.
609            if(null != text) { oos.write(text); }
610            }
611    
612        /**Deserialise: use constructor for validation, defensive copying, etc. */
613        private Object readResolve()
614            // throws ObjectStreamException
615            {
616            // As a minor optimisation, use EMPTY for any empty value.
617            if(text == null) { return(EMPTY); }
618    
619            // The text field is assumed to be guaranteed unshared already.
620            return(MemoryTools.intern(new Compact7BitString(text)));
621            }
622    
623        /**Margin that byte[] must be smaller than String.length() for this to be smaller on the wire.
624         * If we get this slightly too large then we will write more Strings than necessary
625         * but may improve compression of multiple instances a little.
626         * <p>
627         * This value represents the overhead of serialising this instance
628         * compared to the special-case String representation
629         * plus some other overheads and potential loss of global compressibility.
630         */
631        private static final int _FUDGE_FACTOR = 64;
632    
633        /**Returns the best format to write the contained text to the wire.
634         * We write ourself to the wire as a java.lang.String
635         * if it appears that that would be more efficient and/or safer.
636         * <p>
637         * We almost always write ourself out as a String
638         * assuming that a good general-purpose compressor such as ZIP
639         * will do better than our memory-efficient form given the original text,
640         * but if we manage to find a lot of compressibility
641         * then we'll save ourself directly.
642         * <p>
643         * If a static dictionary is in use then we force serialisation as a String
644         * for safety (to avoid ambiguity).
645         */
646        protected Object writeReplace()
647            throws ObjectStreamException
648            {
649            // A zero-length text is best represented as the (literal, intern()ed) empty String.
650            if(text == null) { return(""); }
651    
652            // If a static dictionary has been set, or
653            // if we didn't manage to super-compress the input text
654            // or if the compressed text is too long to represent on the wire,
655            // then serialise this instance as an intern()ed CS8Bit rather than itself
656            // since that is probably near-optimal (in stream UTF-8 format) already,
657            // especially when multiple text instances are compressed in one stream
658            // with a good general-purpose compressor.
659            // CS8 is used for its predictable serialisation behaviour (eg Name might drag in other stuff).
660            final String s = toString();
661            final int sLen = s.length();
662            if((null != getDict()) ||
663               (text.length > Short.MAX_VALUE) ||
664               (text.length + _FUDGE_FACTOR >= sLen) ||
665               (text.length > (sLen>>>4)))
666                { return(MemoryTools.intern(new CS8Bit(s))); }
667    
668    if(IsDebug.isDebug) { System.out.println("[C7BS: serialising as self: text.length="+text.length+", toString().length()="+toString().length()+".]"); }
669    
670            // Write this instance out as itself.
671            return(this);
672            }
673    
674        /**Validate fields/state.
675         * Called in the constructor and possibly after de-serialising.
676         * <p>
677         * Barf if something bad is found.
678         * (Maybe allow some extra info in debug version.)
679         */
680        public void validateObject()
681            throws InvalidObjectException
682            {
683            // We should use a null array rather than an empty one.
684            if((text != null) && (text.length == 0))
685                { throw new InvalidObjectException("bad object: text.length == 0"); }
686            if((text != null) && (text.length > Short.MAX_VALUE))
687                { throw new InvalidObjectException("bad object: text.length > Short.MAX_VALUE"); }
688    
689            // TODO: validate content
690            }
691    
692        /**Unique serialisation ID. */
693        private static final long serialVersionUID = -5836076660507008468L;
694    
695        /**Our tokenising pattern; never null.
696         * This accepts either a "word" or a non-word,
697         * the idea is to compress an alternating sequence of such non-empty tokens.
698         * <p>
699         * The word characters are [a-zA-Z_0-9].
700         * The non-word characters are everything else in the [0,127] 7-bit-ASCII alphabet.
701         * <p>
702         * These tokens need not be strictly alternating
703         * provided that ambiguity can be avoided.
704         * <p>
705         * These tokens are currently not limited in length.
706         * <p>
707         * Inspired by the HuffWord compression tokeniser from "Managing Gigabytes".
708         */
709    //    private static final Pattern tokeniser = Pattern.compile("([\\w]+)|([\\W]+)");
710    
711    //    /**If true, use regex-based tokenisation, else use hand-crafted solution. */
712    //    private static final boolean USE_REGEX_TOKENISER = false;
713    
714        /**Boolean true/false flags for what are considered "word" chars; initialised on first use.
715         * Also holds intern()ed one-byte Strings for each token
716         * to avoid run-time allocation similar in intent to Integer.valueOf(int).
717         * <p>
718         * The word characters are [a-zA-Z_0-9].
719         * The non-word characters are everything else in the [0,127] 7-bit-ASCII alphabet.
720         * <p>
721         * These tokens need not be strictly alternating
722         * provided that ambiguity can be avoided.
723         * <p>
724         * These tokens are currently not limited in length.
725         * <p>
726         * Inspired by the HuffWord compression tokeniser from "Managing Gigabytes".
727         */
728        private static final class WordChars
729            {
730            private WordChars(){} // Prevent construction.
731    
732            /**An entry is true if the corresponding ASCII char of the index is a "word" char. */
733            static final boolean isWordChar[] = new boolean[Byte.MAX_VALUE+1];
734            static
735                {
736                Arrays.fill(isWordChar, '0', '9'+1, true);
737                Arrays.fill(isWordChar, 'A', 'Z'+1, true);
738                Arrays.fill(isWordChar, 'a', 'z'+1, true);
739                isWordChar['_'] = true;
740                }
741    
742            /**One-byte intern()ed String for this char. */
743            static final String s[] = new String[Byte.MAX_VALUE+1];
744            static
745                {
746                for(int i = Byte.MAX_VALUE+1; --i >= 0; )
747                    { s[i] = String.valueOf((char) i).intern(); }
748                }
749            }
750    
751        /**Routine to chop the (non-empty) input CharSequence as an in-order List of String tokens; result may be empty but is never null.
752         * The toString() method may be used to convert all if the input (or subsequences) to a String,
753         * so must behave well.
754         * <p>
755         * The returned list has fast random access, ie with get(index).
756         */
757        private static List<String> tokenise(final CharSequence in)
758            {
759            final int len = in.length();
760            // Handle small inputs specially:
761            //   * Empty input is (would be) zero tokens.
762            //   * Single byte input is a single token.
763            if(len < 2)
764                {
765                assert(len == 1);
766    //            if(len == 0)
767    //                {
768    //                final List<String> result = Collections.emptyList();
769    //                return(result);
770    //                }
771                final List<String> result = Collections.singletonList(in.toString());
772                return(result);
773                }
774    
775            final ArrayList<String> tokens = new ArrayList<String>(2 + (len>>2));
776    
777            int currentTokenStart = 0; // Start of current token...
778            boolean currentTokenIsWord = WordChars.isWordChar[in.charAt(0)];
779            for(int i = 0; ++i < len; )
780                {
781                final boolean thisIsWordChar = WordChars.isWordChar[in.charAt(i)];
782                if(thisIsWordChar != currentTokenIsWord)
783                    {
784                    // We just started a new token.
785                    // Handle common single-character token case efficiently (no allocation).
786                    if(i == currentTokenStart+1)
787                        { tokens.add(WordChars.s[in.charAt(currentTokenStart)]); }
788                    else // Generic (length != 1) case.
789                        { tokens.add(in.subSequence(currentTokenStart, i).toString()); }
790                    // Note details of new token.
791                    currentTokenStart = i;
792                    currentTokenIsWord = thisIsWordChar;
793                    }
794                }
795            // Create/add the trailing token,
796            // making sure that the common single-char case is efficient.
797            if(len == currentTokenStart+1)
798                { tokens.add(WordChars.s[in.charAt(currentTokenStart)]); }
799            else // Generic (length != 1) case.
800                { tokens.add(in.subSequence(currentTokenStart, len).toString()); }
801            return(tokens);
802            }
803    
804        /**If true, then don't try to match the immediately-previous input token.
805         * This is usually not going to be effective
806         * as unless we split tokens then we can never match the previous value
807         * as we alternate between word and non-word.
808         * So not trying to match saves a little bit of time.
809         * <p>
810         * This also frees up the "-1" offset value as an escape,
811         * eg for multi-token-match expansion.
812         */
813        private static final boolean DONT_TRY_TO_MATCH_PREV_TOKEN = true;
814    
815        /**If true then allow multi-token matches (MTM) for better compression of very repetitive text.
816         * This is for where a number of tokens in sequence are identical
817         * to some previous (near) token sequence.
818         */
819        private static final boolean ALLOW_MULTI_TOKEN_MATCHING = DONT_TRY_TO_MATCH_PREV_TOKEN;
820    
821        /**If true, allow us to do include 1-char tokens in multi-token matches (MTMs).
822         * This should help maximise the compression from multi-token matches,
823         * since many possible multi-token sequences will start with
824         * a single-character token, such as " ".
825         */
826        private static final boolean ALLOW_1_CHAR_MTM = true;
827    
828        /**If true, allow partial (prefix) token matching for long (semi-)unique tokens.
829         * Can be slow, but allows a little more compression.
830         */
831        private static final boolean ALLOW_PARTIAL_TOKEN_MATCHING = true;
832    
833        /**Bits that we use to represent a token sequence length; strictly positive and less than 8. */
834        private static final int TOKEN_SEQ_LEN_BITS = 3;
835    
836        /**Minimum number of sequential tokens that we can encode as a sequence; strictly positive.
837         * Since the minimum encoding is usually 2 bytes
838         * then we set a minimum sequence length longer than this
839         * to guarantee to do better than just a sequence of separate back-references.
840         */
841        private static final int TOKEN_SEQ_LEN_MIN = 3;
842    
843        /**Maximum number of sequential tokens that we can encode as a sequence; strictly positive.
844         * Note that we store the sequence length - 2,
845         * since we would use the normal back-reference for a single-token match,
846         * so we use a zero value to indicate a length-of-two sequence.
847         */
848        private static final int TOKEN_SEQ_LEN_MAX = (1 << TOKEN_SEQ_LEN_BITS) - 1 + TOKEN_SEQ_LEN_MIN;
849    
850        /**Maximally-negative offset that we can hold above the sequence length in one byte; strictly negative.
851         * We have an implict logical 9th sign bit, always 1 (ie negative).
852         */
853        private static final int TOKEN_SEQ_OFFSET_MIN = (~0xff) >> TOKEN_SEQ_LEN_BITS; // Use sign-preserving right shift.
854    
855    
856        /**Specialised immutable sub-class to allow use with a StaticDictionary.
857         * Not directly visible outside this class,
858         * but is constructed by convertToCompact7BitString() as necessary.
859         * <p>
860         * Note that this can never be directly (de)serialised
861         * since this forces itself to String form when serialisation is attempted.
862         * This is because we cannot efficiently serialise the dictionary
863         * and thus would not have enough information to ensure correct decoding.
864         */
865        private final static class WithDict extends Compact7BitString
866            {
867            private WithDict(final byte text[], final StaticDictionary dict)
868                {
869                super(text);
870                if(dict == null)
871                    { throw new IllegalArgumentException("bad object: null dictionary"); }
872                this.dict = dict;
873                }
874    
875            /**Always write as (ie return) a String; never null. */
876            @Override
877            protected Object writeReplace()
878                throws ObjectStreamException
879                {
880                assert(dict != null); // We must write as a String, given that we have a dictionary.
881                return(MemoryTools.intern(toString()));
882                }
883    
884            /**Static dictionary (if any) used to aid compression; never null.
885             * It is forbidden to serialise an instance as-is
886             * so as to prevent decompression errors/ambiguities,
887             * so these sub-classes as always serialised as String instances.
888             */
889            private final StaticDictionary dict;
890    
891            /**Get the static dictionary, null if none. */
892            @Override
893            protected StaticDictionary getDict() { return(dict); }
894    
895            /**Cache of the hash code for speed.
896             * Since we've blown some extra space on the (reference) to the dictionary,
897             * we'll blow a little more on a cache since hash computation can be expensive.
898             * <p>
899             * We assume that it has not been computed and cached if zero.
900             */
901            private int _hash;
902    
903            /**Derive (and cache) a hash based on the text and the dictionary.
904             * This adds in some distinguishing sauce from the dictionary
905             * to better spread out hash codes from C7BS texts.
906             */
907            @Override
908            public int hashCode()
909                {
910                int h = _hash;
911                if(h == 0)
912                    {
913                    // Compute and cache the hash derived from that of the base class
914                    // with some bits from the dictionary.
915                    h = super.hashCode() - dict.name.hashCode();
916                    _hash = h; // Cache the hash for next time.
917                    }
918                return(h);
919                }
920    
921            /**Serialisation UID; never really used. */
922            private static final long serialVersionUID = -7930788498700479835L;
923            }
924    
925        /**Losslessly convert a String entirely consisting of 7-bit ASCII text to a tokenised form for compression/serialisation; not null unless the input is null.
926         * This should be especially effective for XML, but also OK for general text.
927         * <p>
928         * This is designed to be relatively fast and consume little memory.
929         * <p>
930         * Use of a good shared static dictionary between multiple instances
931         * may significantly reduce memory consumption,
932         * but it is not permitted to serialise instances using such a dictionary
933         * other than as a String value.
934         * <p>
935         * This does NOT intern() the result since it would not be appropriate in all cases
936         * (and could add significant CPU and memory overhead).
937         *
938         * @param s  the 7-bit ASCII text to encode
939         * @param dict  static compression dictionary to use, or null if none
940         *
941         * @throws  IllegalArgumentException  if the argument is not 7-bit ASCII
942         *     ie contains characters outside the range 0--127
943         */
944        public static final Compact7BitString convertToCompact7BitString(final CharSequence s,
945                                                                         final StaticDictionary dict)
946            {
947            if(s == null) { return(null); }
948    
949            // In the simple, zero-length, case we return the static/shared EMPTY value.
950            final int length = s.length();
951            if(length == 0) { return(EMPTY); }
952    
953            // Verify that the input is pure 7-bit.
954            // Veto conversion with IllegalArgumentException if not.
955            for(int i = length; --i >= 0; )
956                {
957                final char c = s.charAt(i);
958                if(c > 127) { throw new IllegalArgumentException("not pure 7-bit ASCII text"); }
959                }
960    
961            // Tokenise the input.
962            final List<String> tokensRaw = tokenise(s);
963            assert(tokensRaw instanceof RandomAccess); // We need fast random access into the list.
964            final List<String> tokens;
965            // Without a dictionary we use the tokens list from the tokeniser as-is.
966            if(dict == null)
967                { tokens = tokensRaw; }
968            // With a dictionary, we allow -ve get()s into the static tokens.
969            else
970                {
971                final class VirtualStringList extends AbstractList<String> implements RandomAccess
972                    {
973                    @Override public final String get(final int index)
974                        {
975                        // In the simple +ve index case, fetch from the underlying List.
976                        if(index >= 0) { return(tokensRaw.get(index)); }
977                        // Deal with -ve references into the static dictionary.
978                        if(index <= StaticDictionary.FIRST_VIRTUAL_OFFSET)
979                            {
980                            final int vIndex = StaticDictionary.FIRST_VIRTUAL_OFFSET - index;
981                            if(vIndex < dict.tokens.size())
982                                { return(dict.tokens.get(vIndex)); }
983                            throw new IndexOutOfBoundsException("illegal back reference: " + index);
984                            }
985                        return(""); // Deal with "gap" at -1 by returning an impossible token...
986                        }
987    
988                    @Override public final void add(final int index, final String element)
989                        { tokensRaw.add(index, element); }
990    
991                    @Override public final String set(final int index, final String element)
992                        { return(tokensRaw.set(index, element)); }
993    
994                    /**Size is reported as that of the underlying "raw" List; -ve indexed items are not counted. */
995                    @Override public final int size() { return(tokensRaw.size()); }
996                    }
997                tokens = new VirtualStringList();
998                }
999    
1000            // Map of all tokens seen so far to last position they were seen in.
1001            // All tokens here will be >1 char, unless ALLOW_1_CHAR_MTM is true.
1002            final HashMap<String,Integer> tokensSeenSoFar = new HashMap<String,Integer>(256 + 2*tokens.size());
1003    
1004            // If we have a static dictionary then insert its tokens now.
1005            if(dict != null) { tokensSeenSoFar.putAll(dict.virtualOffsets); }
1006    
1007            // Generate the output.
1008            final StringBuilder sb = new StringBuilder(s.length());
1009            main: for(int i = 0; i < tokens.size(); ++i)
1010                {
1011                final String token = tokens.get(i);
1012                final int tokenLength = token.length();
1013                assert(tokenLength > 0);
1014    
1015                // We keep this single-character token as-is
1016                // (ie as an uncompressed literal byte)
1017                // unless we are potentially allowing this as
1018                // the start of a multi-token match.
1019                if(!ALLOW_1_CHAR_MTM && (tokenLength == 1))
1020                    {
1021    if(VERBOSE_DEBUG) { System.out.println("LITERAL "+((int) token.charAt(0))); }
1022                    tokensSeenSoFar.put(token, Integer.valueOf(i));
1023                    sb.append(token);
1024                    continue;
1025                    }
1026    
1027                // Current input token-list size.
1028                final int nTokensNow = tokens.size();
1029    
1030                // Look for the nearest match backwards in the valid match window.
1031                // But only look if we know that the token has already been seen so far.
1032                final Integer lastPos = tokensSeenSoFar.get(token);
1033                if(lastPos != null)
1034                    {
1035                    final int lp = lastPos.intValue();
1036                    assert(token.equals(tokens.get(lp)));
1037                    final int offset = lp - i;
1038                    // Ignore matches that we cannot use.
1039                    if((offset >= Byte.MIN_VALUE) &&
1040                       (!(DONT_TRY_TO_MATCH_PREV_TOKEN && (offset == -1))))
1041                        {
1042                        assert(token.equals(tokens.get(i+offset)));
1043    
1044                        if(ALLOW_MULTI_TOKEN_MATCHING)
1045                            {
1046                            // Look FORWARD and count how many sequential tokens match from here
1047                            // (from index j inclusive up to index i exclusive)
1048                            // which represent tokens already encoded/sent.
1049                            // We may then be able to encode them all in one fell swoop...
1050                            int count = 1;
1051                            final int endStop = nTokensNow-1 + offset; // Can't search beyond end of the input!
1052                            for(int k = lp; ++k <= endStop; )
1053                                {
1054                                // Stop if the sequence stops matching...
1055                                if(!tokens.get(k).equals(tokens.get(k-offset)))
1056                                    { break; }
1057                                // Stop if we reach the maximum sequence length that we can handle.
1058                                if(++count == TOKEN_SEQ_LEN_MAX)
1059                                    { break; }
1060                                }
1061                            assert((count > 0) && (count <= TOKEN_SEQ_LEN_MAX));
1062                            // Handle a long-enough-to-be-worthwhile sequence,
1063                            // but only if it is within range...
1064                            if(count >= TOKEN_SEQ_LEN_MIN)
1065                                {
1066                                final int normalisedOffset = offset + 1; // Offset of -1 is never used.
1067                                if(normalisedOffset >= TOKEN_SEQ_OFFSET_MIN)
1068                                    {
1069    if(VERBOSE_DEBUG) { System.out.println("ENCODABLE SEQ COUNT, OFFSET, SEQ: " + count + ", " + offset + ", " + tokens.subList(i, i+count)); }
1070                                    sb.append((char) -1); // Escape code.
1071                                    final int normalisedCount = (count - TOKEN_SEQ_LEN_MIN);
1072                                    assert(normalisedCount < (1 << TOKEN_SEQ_LEN_BITS));
1073                                    final int encoded = (normalisedOffset << TOKEN_SEQ_LEN_BITS) + normalisedCount;
1074                                    sb.append((char) encoded); // Only the bottom byte is important.
1075    
1076                                    // Note the first token processed.
1077                                    tokensSeenSoFar.put(token, Integer.valueOf(i));
1078    
1079                                    // Skip an extra count-1 tokens that we're encoding...
1080                                    // Note all the (> length 1) tokens that we're skipping...
1081                                    for(int k = count; --k > 0; )
1082                                        {
1083                                        final String skippedToken = tokens.get(++i);
1084                                        if(ALLOW_1_CHAR_MTM || (skippedToken.length() > 1))
1085                                            { tokensSeenSoFar.put(skippedToken, Integer.valueOf(i)); }
1086                                        }
1087    
1088                                    continue main;
1089                                    }
1090                               }
1091                            // Fall through to write normal single-token back-reference.
1092    
1093                            // If the token length is 1
1094                            // then we prefer to write the literal over the back-ref
1095                            // though it does not matter in fact.
1096                            if(tokenLength == 1)
1097                                {
1098    if(VERBOSE_DEBUG) { System.out.println("LITERAL "+((int) token.charAt(0))); }
1099                                tokensSeenSoFar.put(token, Integer.valueOf(i));
1100                                sb.append(token);
1101                                continue;
1102                                }
1103    
1104                            // We have a match to a single previous nearby token...
1105                            // So write a single-byte relative back-reference rather than the token.
1106    if(VERBOSE_DEBUG) { System.out.println("BACK REF OFFSET "+offset+" = " + token); }
1107                            tokensSeenSoFar.put(token, Integer.valueOf(i));
1108                            sb.append((char) offset); // Only care about bottom byte...
1109                            continue main;
1110                            }
1111                        }
1112                    }
1113    
1114                // If we could not find a suitable full-token match
1115                // then we may be able to find a partial match instead
1116                // (providing that this token is long enough to split in two).
1117                if(ALLOW_PARTIAL_TOKEN_MATCHING && (tokenLength > 2))
1118                    {
1119                    // We prefer to match the longest-possible prefix
1120                    // to try to maximise compression.
1121                    for(int prefixLength = tokenLength; --prefixLength > 1; )
1122                        {
1123                        final String prefix = token.substring(0, prefixLength);
1124                        final Integer lastPrefixPos = tokensSeenSoFar.get(prefix);
1125                        if(lastPrefixPos == null) { continue; }
1126                        final int lpp = lastPrefixPos.intValue();
1127                        assert(prefix.equals(tokens.get(lpp)));
1128                        final int offset = lpp - i;
1129                        // Skip matches that we cannot use.
1130                        if(offset < Byte.MIN_VALUE) { continue; }
1131                        if(DONT_TRY_TO_MATCH_PREV_TOKEN && (offset == -1)) { continue; }
1132                        assert(prefix.equals(tokens.get(i+offset)));
1133                        // We veto this prefix match (and of all other prefixes of this token)
1134                        // if the whole token may be the target of a back-reference later.
1135                        if(tokens.subList(i+1, Math.min(nTokensNow, i-Byte.MIN_VALUE)).contains(token)) { break; }
1136    //System.out.println("Prefix match allowed, length: " + prefix.length());
1137                        // Adjust tokens list with new split token.
1138                        tokens.set(i, prefix);
1139                        tokens.add(i+1, token.substring(prefixLength));
1140    if(VERBOSE_DEBUG) { System.out.println("BACK (PREFIX) REF OFFSET "+offset+" = " + token); }
1141                        tokensSeenSoFar.put(prefix, Integer.valueOf(i));
1142                        sb.append((char) offset); // Only care about bottom byte...
1143                        continue main;
1144                        }
1145                    }
1146    
1147                // Log this uncompressed >1-char early token
1148                // if the first occurrence for this compression/input
1149                // if compiling global stats
1150                // and if this is close to the start of the input.
1151                if(LOG_STATS && (dict != null) &&
1152                    (tokenLength > 1) && (i < -Byte.MIN_VALUE) &&
1153                    (null == tokensSeenSoFar.get(token)))
1154                    {
1155                    // Create a counter for this token if need be.
1156                    dict._uncompEarlyTokens.putIfAbsent(token, new Tuple.Pair<AtomicInteger,AtomicInteger>(new AtomicInteger(),new AtomicInteger()));
1157                    // Atomically increment the counter for this token.
1158                    final Pair<AtomicInteger, AtomicInteger> pair = dict._uncompEarlyTokens.get(token);
1159                    final int tc = pair.first.incrementAndGet();
1160                    // Update the summed earliest-position value...
1161                    pair.second.addAndGet(i);
1162    if(VERBOSE_DEBUG && ((tc % 1000) == 0)) { System.out.println("[Token count for '"+token+"': "+tc+".]"); }
1163                    }
1164    
1165                // Now now update tokensSeenSoFar with this token.
1166                tokensSeenSoFar.put(token, Integer.valueOf(i));
1167                // Did not find a (near enough) match, so write token as-is.
1168                sb.append(token);
1169                }
1170    //System.out.println("[Length in (chars) / out (bytes): "+s.length()+"/"+sb.length()+".]");
1171    
1172            // Convert back to byte[].
1173            final byte[] text = new byte[sb.length()];
1174            for(int i = text.length; --i >= 0; )
1175                { text[i] = (byte) sb.charAt(i); }
1176    
1177    if(VERBOSE_DEBUG) { System.out.println("--- ENCODE DONE"); }
1178            final Compact7BitString result = (dict == null) ?
1179                new Compact7BitString(text) : new WithDict(text, dict);
1180    //        assert(s.equals(result.toString())) : "Generated bad compressed form: '"+result+"' from input: '"+s+"'";
1181            // DO NOT intern(): allow the caller to decide if that is appropriate.
1182            return(result);
1183            }
1184        }