001 /*
002 Copyright (c) 1996-2011, Damon Hart-Davis
003 All rights reserved.
004
005 Redistribution and use in source and binary forms, with or without
006 modification, are permitted provided that the following conditions are
007 met:
008
009 * Redistributions of source code must retain the above copyright
010 notice, this list of conditions and the following disclaimer.
011
012 * Redistributions in binary form must reproduce the above copyright
013 notice, this list of conditions and the following disclaimer in the
014 documentation and/or other materials provided with the
015 distribution.
016
017 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
018 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
019 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
020 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
021 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
022 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
023 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
024 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
025 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
026 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
027 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
028 */
029 package org.hd.d.pg2k.svrCore;
030
031 import java.io.IOException;
032 import java.io.InvalidObjectException;
033 import java.io.ObjectInputStream;
034 import java.io.ObjectInputValidation;
035 import java.io.ObjectOutputStream;
036 import java.io.ObjectStreamException;
037 import java.io.PrintStream;
038 import java.io.Serializable;
039 import java.util.AbstractList;
040 import java.util.ArrayList;
041 import java.util.Arrays;
042 import java.util.Collections;
043 import java.util.Comparator;
044 import java.util.HashMap;
045 import java.util.HashSet;
046 import java.util.List;
047 import java.util.RandomAccess;
048 import java.util.SortedSet;
049 import java.util.TreeSet;
050 import java.util.concurrent.ConcurrentHashMap;
051 import java.util.concurrent.ConcurrentMap;
052 import java.util.concurrent.atomic.AtomicInteger;
053
054 import org.hd.d.pg2k.svrCore.Tuple.Pair;
055 import org.hd.d.pg2k.svrCore.Tuple.Triple;
056
057 import ORG.hd.d.IsDebug;
058
059 /**Compact immutable in-memory and on-the-wire representation of a (short, potentially compressible) 7-bit ASCII String.
060 * Converting to and from this form should be relatively resource-light,
061 * eg with low CPU and memory requirements.
062 * <p>
063 * Where there is a conflict between memory consumption and CPU,
064 * this implementation aims to minimise memory use at the possible expense of CPU.
065 * Thus this class is probably inefficient to use as a key in a hashed data structure
066 * where the hash is used heavily,
067 * since we do not cache the hash even though it may be expensive to compute.
068 * <p>
069 * This class is instance-controlled,
070 * and uses intern() to avoid duplicate in-memory copies.
071 * <p>
072 * This cannot hold text longer than Short.MAX_VALUE characters long.
073 * <p>
074 * Internally this uses a byte[] to store the data
075 * (a basic halving of in-memory size in most JVMs)
076 * with some tokenising for further compaction of plain-text for example.
077 * No attempt is made to share state with other instances in memory
078 * (unless a shared StaticDictionary is used).
079 * <p>
080 * An instance of Compact7BitString may serialise as an (immutable) CharSequence rather than as itself
081 * if it seems likely to be more efficient/compressible/safe to do so
082 * (a) for global compressibility (b) for memory efficiency during (de)serialisation
083 * or (c) if the value cannot be encoded other than as a String instance.
084 * <p>
085 * So a coder had better make any serialisable field with a Compact7BitString value
086 * be of type Object or Serializable
087 * to allow it to be read back as either this or a CharSequence.
088 * <p>
089 * This is not publicly constructible,
090 * and may have specialised sub-classes to support some features,
091 * while keeping the base class as small as possible.
092 * <p>
093 * This base class supports all CharSequence methods,
094 * but those that imply random access may be very slow
095 * and take O(n) time where n is the length of the String.
096 */
097 public class Compact7BitString implements Serializable,
098 ObjectInputValidation,
099 MemoryTools.Internable
100 {
101 /**If true, log our encode/decode steps to help debug problems. */
102 private static final boolean VERBOSE_DEBUG = false;
103
104 /**If true, then log some global stats for tuning.
105 * Should generally be false so as to avoid wasting CPU time and memory.
106 */
107 private static final boolean LOG_STATS = false;
108
109 /**Immutable static dictionary to improve in-memory compression.
110 * It is not permitted to serialise as-is a Compact7BitString that is using one of these
111 * so as to prevent decompression errors/ambiguities.
112 */
113 public static final class StaticDictionary
114 {
115 /**Create an instance of a static dictionary.
116 *
117 * @param name dictionary name; must not be null
118 * @param tokens unique tokens, non-null, best first, each of length > 1,
119 * non-empty, no longer than MAX_TOKENS
120 */
121 public StaticDictionary(final String name,
122 final List<String> tokens)
123 {
124 if(name == null) { throw new IllegalArgumentException(); }
125 this.name = name;
126 if(tokens == null) { throw new IllegalArgumentException(); }
127 // Take a defensive copy.
128 this.tokens = Collections.unmodifiableList(new ArrayList<String>(tokens));
129 // Validate the token set.
130 if(this.tokens.isEmpty() || (this.tokens.size() > MAX_TOKENS))
131 { throw new IllegalArgumentException("too many/few tokens"); }
132 if(this.tokens.size() != (new HashSet<String>(this.tokens)).size())
133 { throw new IllegalArgumentException("duplicate token"); }
134 for(final String s : this.tokens)
135 {
136 if((s == null) || (s.length() < 2))
137 { throw new IllegalArgumentException("invalid token (null or too short): " + s); }
138 if(tokenise(s).size() != 1)
139 { throw new IllegalArgumentException("invalid token (mixed word and non-word chars): " + s); }
140 }
141
142 // Make lookup table from token to virtual "-ve" offsets.
143 // Note that "best" tokens have the smallest offset
144 // so as to be reachable from as far into the input stream as possible.
145 final HashMap<String,Integer> virtualOffsets = new HashMap<String,Integer>(2 * this.tokens.size());
146 // We always start at offset -2 in case use of offset -1 is not possible.
147 int offset = FIRST_VIRTUAL_OFFSET;
148 for(final String t : this.tokens)
149 {
150 assert(offset >= Byte.MIN_VALUE);
151 final Integer oI = Integer.valueOf(offset--);
152 virtualOffsets.put(t, oI);
153 }
154 assert(!virtualOffsets.isEmpty());
155 this.virtualOffsets = virtualOffsets;
156 }
157
158 /**The maximum number of simple tokens for the static dictionary; strictly positive. */
159 public static final int MAX_TOKENS = -Byte.MIN_VALUE - 2;
160
161 /**First (numerically most positive) "virtual" offset used by the static dictionary; strictly negative.
162 * We start at -2 to avoid conflict with -1 used to escape/start an MTM sequence.
163 */
164 private static final int FIRST_VIRTUAL_OFFSET = -2;
165
166 /**The name of this dictionary, never null. */
167 public final String name;
168
169 /**The immutable (non-empty) list of unique, best-first tokens; never null nor empty. */
170 public final List<String> tokens;
171
172 /**Virtual offsets of tokens in static dictionary; never null nor empty.
173 * Logically immutable.
174 * <p>
175 * Kept private because it is faster to keep it unwrapped and mutable.
176 */
177 private final HashMap<String,Integer> virtualOffsets;
178
179 /**Human-readable summary: including the dictionary name; never null. */
180 @Override public String toString() { return(name); }
181
182 /**Only equal if all non-stats fields are. */
183 @Override public boolean equals(final Object obj)
184 {
185 if(this == obj) { return(true); }
186 if(!(obj instanceof StaticDictionary)) { return(false); }
187 final StaticDictionary other = (StaticDictionary) obj;
188 if(!name.equals(other.name)) { return(false); }
189 if(!tokens.equals(other.tokens)) { return(false); }
190 if(!virtualOffsets.equals(other.virtualOffsets)) { return(false); }
191 return(true); // Identical.
192 }
193
194 /**We don't expect to have to distinguish other than on name in practice, but we use the table size too. */
195 @Override public int hashCode() { return(name.hashCode() + tokens.size()); }
196
197 /**Thread-safe map from text of uncompressed token (close to start of input) to count and summed-position of the token; non-null iff LOG_STATS is true.
198 * Updated when attempting to compress String data.
199 * <p>
200 * This map is for early tokens of length 2 or more characters,
201 * ie that we might be able to compress better using a static dictionary.
202 * <p>
203 * This is kept for statistical purposes.
204 * <p>
205 * Only the first occurrence of each token in each compress/input is counted.
206 */
207 private final ConcurrentMap<String,Tuple.Pair<AtomicInteger,AtomicInteger>> _uncompEarlyTokens = (!LOG_STATS) ? null :
208 new ConcurrentHashMap<String,Tuple.Pair<AtomicInteger,AtomicInteger>>(1<<16);
209
210 /**Returns true if stats-collecting is enabled. */
211 public boolean isStatsEnabled() { return(LOG_STATS); }
212
213 /**Resets the stats (if any) being recorded for this dictionary. */
214 public void resetStats() { if(LOG_STATS) { _uncompEarlyTokens.clear(); } }
215
216 /**Dump stats (if any) to the given (non-null) stream.
217 * This dumps the stats of compressions against this dictionary instance.
218 *
219 * @param out never null
220 */
221 public void dumpStats(final PrintStream out)
222 {
223 if(!LOG_STATS) { return; /* No stats to show. */ }
224
225 // Generate a sorted set of early uncompressable tokens,
226 // sorted by the product of count and token length.
227 // In order these would then be good candidates
228 // for a static compression dictionary for this class.
229 // The set is of <token,count,rounded-up-mean-first-position>.
230 final SortedSet<Tuple.Triple<String,Integer,Integer>> tokens = new TreeSet<Tuple.Triple<String,Integer,Integer>>(new Comparator<Tuple.Triple<String,Integer,Integer>>(){
231 public int compare(final Triple<String,Integer,Integer> o1, final Triple<String,Integer,Integer> o2)
232 {
233 // Sort with higher count/length product first.
234 // (We use length-1 to reflect number of bytes saved after backref.)
235 final long l1 = o1.second.longValue() * (o1.first.length() - 1);
236 final long l2 = o2.second.longValue() * (o2.first.length() - 1);
237 if(l1 != l2)
238 {
239 if(l1 > l2) { return(-1); /* Correct order. */ }
240 return(1); /* Wrong order. */
241 }
242
243 // We'll break ties on the token value itself
244 // only so as to guarantee a stable total ordering.
245 return(o1.first.compareTo(o2.first));
246 }
247 });
248
249 // We rely on the thread-safety
250 // (and immunity from ConcurrentModificationException)
251 // of our iterator over _uncompEarlyTokens.
252 for(final String token : _uncompEarlyTokens.keySet())
253 {
254 final Pair<AtomicInteger, AtomicInteger> pair = _uncompEarlyTokens.get(token);
255 final int count = pair.first.get();
256 assert(count > 0);
257 final int summedFirstPos = pair.second.get();
258 final int rUMFP = (summedFirstPos + count-1) / count;
259 tokens.add(new Tuple.Triple<String,Integer,Integer>(token,
260 Integer.valueOf(count),
261 Integer.valueOf(rUMFP)));
262 }
263
264 out.println();
265 out.println("DICTIONARY NAME: "+name);
266
267 // Dump the tokens in order, best first.
268 // We limit this to some initial plausible set of candidates.
269 int shownSoFar = 0;
270 for(final Tuple.Triple<String,Integer,Integer> t : tokens)
271 {
272 // Skip rare entries, noting that it takes > 2 to save memory overall.
273 if(t.second.intValue() < 3) { continue; }
274
275 // Skip entries that would be out of reach of their target tokens.
276 if(t.third.intValue() + shownSoFar > StaticDictionary.FIRST_VIRTUAL_OFFSET + -Byte.MIN_VALUE) { continue; }
277
278 // Compute an "escaped" version of the token
279 // that can go in a String literal/constant.
280 String s = t.first;
281 s = s.replace("\\", "\\\\"); // Escape all \ as \\.
282 s = s.replace("\"", "\\\""); // Replace " with \".
283 s = s.replace("\r", "\\r"); // Replace CR with \r.
284 s = s.replace("\n", "\\n"); // Replace LF with \n.
285 s = s.replace("\t", "\\t"); // Replace TAB with \t.
286 // Replace all other control characters with octal escapes.
287 // Note that we cope gracefully with the value expanding while we work.
288 for(int i = 0; i < s.length(); ++i)
289 {
290 final char c = s.charAt(i);
291 if((c < 32) || (c > 126))
292 { s.replace(""+c, "\\" + Integer.toOctalString(c)); /* Replace all occurrences of this char at once. */ }
293 }
294
295 out.println("\"" + s + "\", /* count="+t.second+", saving="+(t.second.intValue()*(t.first.length()-1))+", meanFirstPos="+t.third+" */");
296 if(++shownSoFar == StaticDictionary.MAX_TOKENS) { break; }
297 }
298 }
299 }
300
301 /**Returns true iff this holds an empty string ("").
302 * Can avoid decoding an entire String value just to discover that it is not empty.
303 */
304 public boolean isEmpty() { return(text == null); }
305
306 /**Convert to full String form. */
307 @Override
308 public String toString()
309 {
310 // Simple empty case.
311 if(text == null) { return(""); }
312
313 // We accumulate the latest literal bytes of input here
314 // until we can unambiguously extract one or more full tokens from it.
315 // A surprising number of tokens are > 16 chars (the default SB size)
316 // and accumulating untokenised input may thus often force a (slow) resize.
317 final StringBuilder latestInput = new StringBuilder(32);
318
319 final int len = text.length;
320 final StringBuilder result = new StringBuilder(32 + (len<<3)); // Assume usually <8x compression.
321 final ArrayList<String> tokens = new ArrayList<String>(16 + (len>>1)); // Assume ~2x fewer output tokens than compressed bytes.
322 for(int i = 0; i < len; ++i)
323 {
324 final int ic = text[i];
325
326 // A positive (7-bit) value is a literal/plain character.
327 // It may or may not be part of a multi-byte token.
328 if(ic >= 0)
329 {
330 if(VERBOSE_DEBUG) { System.out.println("LITERAL "+ic+((ic >= 32) ? (" '"+((char)ic)+"'") : "")); }
331 final char c = (char) ic;
332 result.append(c);
333 latestInput.append(c);
334 continue;
335 }
336
337 // Tokenise any accumulated literal characters...
338 if(latestInput.length() > 0)
339 {
340 final List<String> newTokens = tokenise(latestInput);
341 // Handle efficiently the common case of a single new token.
342 if(newTokens.size() == 1) { tokens.add(newTokens.get(0)); }
343 else { tokens.addAll(newTokens); }
344 // ...and clear the accumulated input now that it has been tokenised.
345 latestInput.setLength(0);
346 }
347
348 // We never normally expect a direct reference to the previous token,
349 // so this should indicate a multi-token match code.
350 if(ALLOW_MULTI_TOKEN_MATCHING && (ic == -1))
351 {
352 if(i == len-1)
353 { throw new IllegalStateException("illegal multi-token escape at end of data"); }
354
355 // We are decoding multiple tokens at once.
356 // Hold on to your hats...
357 final int ic2 = text[++i] | ~0xff; // Extend sign from logical 9th bit upwards...
358
359 // Extract the encoded quantities...
360 final int count = (ic2 & ~(~0 << TOKEN_SEQ_LEN_BITS)) + TOKEN_SEQ_LEN_MIN;
361 final int offset = (ic2 >> TOKEN_SEQ_LEN_BITS) - 1;
362 if(VERBOSE_DEBUG) { System.out.println("DECODED SEQ COUNT, OFFSET, OPC: " + count + ", " + offset); }
363
364 // Expand the token sequence...
365 int index = tokens.size() + offset;
366 for(int j = count; --j >= 0; )
367 {
368 final String token = tokens.get(index++);
369 result.append(token);
370 // And note that this is an input token too.
371 tokens.add(token);
372 }
373 continue;
374 }
375
376 // Now replace the back-reference with the single token that it indicates...
377 final int index = tokens.size() + ic;
378 final String token;
379 if(index < 0)
380 {
381 if((getDict() != null) && (index <= StaticDictionary.FIRST_VIRTUAL_OFFSET))
382 {
383 // This may be a reference to a token from the static dictionary.
384 final int vIndex = StaticDictionary.FIRST_VIRTUAL_OFFSET - index;
385 if(vIndex >= getDict().tokens.size())
386 { throw new IllegalStateException("illegal back reference: ic="+ic+", dict.tokens.size()="+getDict().tokens.size()); }
387 token = getDict().tokens.get(vIndex);
388 }
389 else
390 { throw new IllegalStateException("illegal back reference: ic="+ic+", tokens.size()="+tokens.size()); }
391 }
392 else
393 { token = tokens.get(index); }
394 if(VERBOSE_DEBUG) { System.out.println("BACK REF OFFSET "+ic+" = " + token); }
395 result.append(token);
396 // And note that this is an input token too.
397 tokens.add(token);
398 }
399
400 if(VERBOSE_DEBUG) { System.out.println("--- DECODE DONE"); }
401
402 return(result.toString());
403 }
404
405 /**Basic (unshared) representation of 1 byte per character; never empty (is null instead).
406 * Negative values are backward references by the specified number of tokens,
407 * eg -2 means "insert a copy of the token two places back".
408 * <p>
409 * Written out in a customised low-overhead form.
410 */
411 private transient /* final */ byte[] text;
412 //
413 // /**Static dictionary (if any) used to aid compression; null if none.
414 // * It is forbidden to serialise an instance where this is non-null
415 // * so as to prevent decompression errors/ambiguities.
416 // */
417 // private transient final StaticDictionary dict;
418 //
419 // /**Get the static dictionary, null if none. */
420 // protected StaticDictionary getDict() { return(dict); }
421
422 /**Get the static dictionary, null if none.
423 * Always null in the base class since we do not carry a dictionary to save space.
424 */
425 protected StaticDictionary getDict() { return(null); }
426
427 /**Copy of the raw bytes in the internal representation; null if none.
428 * The internal representation is subject to change,
429 * but, with care, this can be used for extra-compact temporary storage.
430 */
431 public byte[] getInternalBytes() { return((text == null) ? null : text.clone()); }
432
433 /**Make a Compact7BitString from the raw getInternalBytes() value; never null.
434 * This is not guaranteed to work if getInternalBytes() is called in one release
435 * and serialised in another, though this will not be broken casually.
436 * <p>
437 * The result is EMPTY or is intern()ed so as to avoid duplicates.
438 * @param dict the static dictionary; can be null
439 */
440 public static Compact7BitString makeFromInternalBytes(final byte raw[], final StaticDictionary dict)
441 {
442 return((raw == null) ? EMPTY :
443 ((dict == null) ? MemoryTools.intern(new Compact7BitString(raw)) :
444 MemoryTools.intern(new WithDict(raw, dict))));
445 }
446
447 /**Make simple un-tokenised byte[] representation.
448 * The byte array must be pure 7-bit ASCII and
449 * it must be guaranteed unshared so that we need not copy it for safety.
450 * <p>
451 * The empty string should be represented with a null array to save space,
452 * not an empty array.
453 */
454 private Compact7BitString(final byte text[] /* , final StaticDictionary dict */ )
455 {
456 this.text = text;
457 //this.dict = dict;
458 //if(IsDebug.isDebug) { System.out.println("C7BS length = " + ((text == null) ? "-" : text.length)); }
459
460 // Verify object state.
461 try { validateObject(); }
462 catch(final InvalidObjectException e)
463 { throw new IllegalArgumentException(e.getMessage(), e); }
464 }
465
466 /**Equal iff the static dictionaries (if any) and compressed texts are identical.
467 * In principle identical char/byte sequences could be compressed in different ways,
468 * but we only treat as equal items whose internal representations are also identical
469 * which makes equality and hashcode computation relatively fast.
470 * <p>
471 * This is designed to work for specialised sub-classes too.
472 */
473 @Override public boolean equals(final Object obj)
474 {
475 if(this == obj) { return(true); }
476 if(null == obj) { return(false); }
477 // Correctly handle the base and derived/specialised variant classes.
478 if(obj.getClass() != this.getClass()) { return(false); }
479 final Compact7BitString other = (Compact7BitString) obj;
480 // Instances with different dictionaries are considered unequal.
481 final StaticDictionary d = getDict();
482 final StaticDictionary otherD = other.getDict();
483 if(d == null) { if(otherD != null) { return(false); } }
484 else if(!d.equals(otherD)) { return(false); }
485 // Equal if the compressed text is identical.
486 return(Arrays.equals(text, other.text));
487 }
488
489 /**Maximum number of bytes to compute a full hash over; strictly positive (or -1 to always compute a full hash).
490 * This should probably be large enough to allow full hashing of the bulk of typical texts
491 * (since a good hash over all content tends to perform best over the whole lifecycle)
492 * especially where they are too short for the length itself to be significant hash-bit source.
493 * <p>
494 * Empirically measured to be fastest at 512 or greater compact()ing live AEP as at 20070402.
495 */
496 private final static int MAX_FULL_HASH_LEN = 512;
497
498 /**Computes the hash code for the text.
499 * The hash is zero for a zero-length (empty) text string.
500 * <p>
501 * By default this hash includes all bytes of the compressed text in the hash,
502 * since that gives best overall performance in hash-based collections.
503 * <p>
504 * However, we cap the time spent generating a hash for very long texts,
505 * hoping to glean some useful full-text information from the compressed length,
506 * and some further useful 'segregation' bits from the dictionary name if any.
507 * <p>
508 * Note that we do NOT cache the computed hash code
509 * since the aim of this representation is to be as compact as possible
510 * and holding an extra word of cached hash would undermine that.
511 */
512 @Override public int hashCode()
513 {
514 // Do the unconditional full hash if not capping hash-computation time.
515 if(MAX_FULL_HASH_LEN < 0) { return(Arrays.hashCode(text)); }
516
517 if(text == null) { return(0); }
518 final int len = text.length;
519 if(len <= MAX_FULL_HASH_LEN) { return(Arrays.hashCode(text)); }
520 int hash = len;
521 // Hash the text backwards from the end until we hit a back-reference
522 // (after hashing a decent-sized chunk of the tail)
523 // which we hope is implicitly imbued with some information from earlier text.
524 int i = len;
525 // Hash a fixed chunk of the tail.
526 final int tailStop = len - MAX_FULL_HASH_LEN/2;
527 assert(tailStop >= 0);
528 while(--i > tailStop)
529 {
530 // We use a different hashing algorithm than Arrays.hashCode()
531 // to minimise collisions with its hashes of short texts.
532 final int b = text[i];
533 hash += ((b + (hash >>> 24)) ^ (hash << 7));
534 }
535 // Hash a bit more until we find a back-reference...
536 while(--i > 0)
537 {
538 // We use a different hashing algorithm than Arrays.hashCode()
539 // to minimise collisions with its hashes of short texts.
540 final int b = text[i];
541 // hash = 7 * hash + b;
542 hash += ((b - 1 - (hash >> 21)) ^ (hash << 5));
543 if(b < 0)
544 {
545 //if(IsDebug.isDebug) { System.err.println("C7BS.hashCode(): stop at "+i+"/"+len); Thread.dumpStack(); }
546 break;
547 }
548 }
549 return(hash);
550 }
551
552 /**Value representing an empty String. */
553 public static final Compact7BitString EMPTY = new Compact7BitString(null);
554
555 /**Deserialise. */
556 private void readObject(final ObjectInputStream in)
557 throws IOException, ClassNotFoundException
558 {
559 final byte l0 = in.readByte();
560 final int length;
561 if(l0 >= 0) { length = l0; }
562 else
563 {
564 final byte l1 = in.readByte();
565 length = -((l0 << 8) + (l1 & 0xff));
566 }
567 // Create and read the text field.
568 if(length == 0) { text = null; }
569 else
570 {
571 text = new byte[length];
572 in.readFully(text);
573 }
574
575 // IMPORTANT:
576 // We assume that the constructor will eventually be called
577 // to do validation and any other setup required.
578 assert(getDict() == null); // We must never deserialise with a dictionary.
579 }
580
581
582 /**Write out a minimally-redundant form of our internal information.
583 * The more-efficient on-the-wire format also makes defensive
584 * reading easier.
585 * <p>
586 * We don't write *any* default state.
587 * <p>
588 * We must never be asked to serialise if we have a (non-null) dictionary.
589 * <p>
590 * We must never be asked to serialise if the text is too long for us to encode.
591 */
592 private void writeObject(final ObjectOutputStream oos)
593 throws IOException
594 {
595 assert (getDict() == null) : "we must not try to serialise if we are using a dictionary";
596 assert ((text == null) || (text.length <= Short.MAX_VALUE)) : "we must not try to serialise a text too long for us to encode";
597
598 // Write the length in a variable-length format.
599 //
600 // A null text is a zero-length text.
601 if(null == text) { oos.writeByte(0); }
602 // For very small strings use a byte-value length.
603 else if(text.length <= Byte.MAX_VALUE) { oos.writeByte(text.length); }
604 // Write larger text sizes using a negative length
605 // so that reading the first byte will give a negative value.
606 else { oos.writeShort(-text.length); }
607
608 // Now write the text bytes directly, unwrapped.
609 if(null != text) { oos.write(text); }
610 }
611
612 /**Deserialise: use constructor for validation, defensive copying, etc. */
613 private Object readResolve()
614 // throws ObjectStreamException
615 {
616 // As a minor optimisation, use EMPTY for any empty value.
617 if(text == null) { return(EMPTY); }
618
619 // The text field is assumed to be guaranteed unshared already.
620 return(MemoryTools.intern(new Compact7BitString(text)));
621 }
622
623 /**Margin that byte[] must be smaller than String.length() for this to be smaller on the wire.
624 * If we get this slightly too large then we will write more Strings than necessary
625 * but may improve compression of multiple instances a little.
626 * <p>
627 * This value represents the overhead of serialising this instance
628 * compared to the special-case String representation
629 * plus some other overheads and potential loss of global compressibility.
630 */
631 private static final int _FUDGE_FACTOR = 64;
632
633 /**Returns the best format to write the contained text to the wire.
634 * We write ourself to the wire as a java.lang.String
635 * if it appears that that would be more efficient and/or safer.
636 * <p>
637 * We almost always write ourself out as a String
638 * assuming that a good general-purpose compressor such as ZIP
639 * will do better than our memory-efficient form given the original text,
640 * but if we manage to find a lot of compressibility
641 * then we'll save ourself directly.
642 * <p>
643 * If a static dictionary is in use then we force serialisation as a String
644 * for safety (to avoid ambiguity).
645 */
646 protected Object writeReplace()
647 throws ObjectStreamException
648 {
649 // A zero-length text is best represented as the (literal, intern()ed) empty String.
650 if(text == null) { return(""); }
651
652 // If a static dictionary has been set, or
653 // if we didn't manage to super-compress the input text
654 // or if the compressed text is too long to represent on the wire,
655 // then serialise this instance as an intern()ed CS8Bit rather than itself
656 // since that is probably near-optimal (in stream UTF-8 format) already,
657 // especially when multiple text instances are compressed in one stream
658 // with a good general-purpose compressor.
659 // CS8 is used for its predictable serialisation behaviour (eg Name might drag in other stuff).
660 final String s = toString();
661 final int sLen = s.length();
662 if((null != getDict()) ||
663 (text.length > Short.MAX_VALUE) ||
664 (text.length + _FUDGE_FACTOR >= sLen) ||
665 (text.length > (sLen>>>4)))
666 { return(MemoryTools.intern(new CS8Bit(s))); }
667
668 if(IsDebug.isDebug) { System.out.println("[C7BS: serialising as self: text.length="+text.length+", toString().length()="+toString().length()+".]"); }
669
670 // Write this instance out as itself.
671 return(this);
672 }
673
674 /**Validate fields/state.
675 * Called in the constructor and possibly after de-serialising.
676 * <p>
677 * Barf if something bad is found.
678 * (Maybe allow some extra info in debug version.)
679 */
680 public void validateObject()
681 throws InvalidObjectException
682 {
683 // We should use a null array rather than an empty one.
684 if((text != null) && (text.length == 0))
685 { throw new InvalidObjectException("bad object: text.length == 0"); }
686 if((text != null) && (text.length > Short.MAX_VALUE))
687 { throw new InvalidObjectException("bad object: text.length > Short.MAX_VALUE"); }
688
689 // TODO: validate content
690 }
691
692 /**Unique serialisation ID. */
693 private static final long serialVersionUID = -5836076660507008468L;
694
695 /**Our tokenising pattern; never null.
696 * This accepts either a "word" or a non-word,
697 * the idea is to compress an alternating sequence of such non-empty tokens.
698 * <p>
699 * The word characters are [a-zA-Z_0-9].
700 * The non-word characters are everything else in the [0,127] 7-bit-ASCII alphabet.
701 * <p>
702 * These tokens need not be strictly alternating
703 * provided that ambiguity can be avoided.
704 * <p>
705 * These tokens are currently not limited in length.
706 * <p>
707 * Inspired by the HuffWord compression tokeniser from "Managing Gigabytes".
708 */
709 // private static final Pattern tokeniser = Pattern.compile("([\\w]+)|([\\W]+)");
710
711 // /**If true, use regex-based tokenisation, else use hand-crafted solution. */
712 // private static final boolean USE_REGEX_TOKENISER = false;
713
714 /**Boolean true/false flags for what are considered "word" chars; initialised on first use.
715 * Also holds intern()ed one-byte Strings for each token
716 * to avoid run-time allocation similar in intent to Integer.valueOf(int).
717 * <p>
718 * The word characters are [a-zA-Z_0-9].
719 * The non-word characters are everything else in the [0,127] 7-bit-ASCII alphabet.
720 * <p>
721 * These tokens need not be strictly alternating
722 * provided that ambiguity can be avoided.
723 * <p>
724 * These tokens are currently not limited in length.
725 * <p>
726 * Inspired by the HuffWord compression tokeniser from "Managing Gigabytes".
727 */
728 private static final class WordChars
729 {
730 private WordChars(){} // Prevent construction.
731
732 /**An entry is true if the corresponding ASCII char of the index is a "word" char. */
733 static final boolean isWordChar[] = new boolean[Byte.MAX_VALUE+1];
734 static
735 {
736 Arrays.fill(isWordChar, '0', '9'+1, true);
737 Arrays.fill(isWordChar, 'A', 'Z'+1, true);
738 Arrays.fill(isWordChar, 'a', 'z'+1, true);
739 isWordChar['_'] = true;
740 }
741
742 /**One-byte intern()ed String for this char. */
743 static final String s[] = new String[Byte.MAX_VALUE+1];
744 static
745 {
746 for(int i = Byte.MAX_VALUE+1; --i >= 0; )
747 { s[i] = String.valueOf((char) i).intern(); }
748 }
749 }
750
751 /**Routine to chop the (non-empty) input CharSequence as an in-order List of String tokens; result may be empty but is never null.
752 * The toString() method may be used to convert all if the input (or subsequences) to a String,
753 * so must behave well.
754 * <p>
755 * The returned list has fast random access, ie with get(index).
756 */
757 private static List<String> tokenise(final CharSequence in)
758 {
759 final int len = in.length();
760 // Handle small inputs specially:
761 // * Empty input is (would be) zero tokens.
762 // * Single byte input is a single token.
763 if(len < 2)
764 {
765 assert(len == 1);
766 // if(len == 0)
767 // {
768 // final List<String> result = Collections.emptyList();
769 // return(result);
770 // }
771 final List<String> result = Collections.singletonList(in.toString());
772 return(result);
773 }
774
775 final ArrayList<String> tokens = new ArrayList<String>(2 + (len>>2));
776
777 int currentTokenStart = 0; // Start of current token...
778 boolean currentTokenIsWord = WordChars.isWordChar[in.charAt(0)];
779 for(int i = 0; ++i < len; )
780 {
781 final boolean thisIsWordChar = WordChars.isWordChar[in.charAt(i)];
782 if(thisIsWordChar != currentTokenIsWord)
783 {
784 // We just started a new token.
785 // Handle common single-character token case efficiently (no allocation).
786 if(i == currentTokenStart+1)
787 { tokens.add(WordChars.s[in.charAt(currentTokenStart)]); }
788 else // Generic (length != 1) case.
789 { tokens.add(in.subSequence(currentTokenStart, i).toString()); }
790 // Note details of new token.
791 currentTokenStart = i;
792 currentTokenIsWord = thisIsWordChar;
793 }
794 }
795 // Create/add the trailing token,
796 // making sure that the common single-char case is efficient.
797 if(len == currentTokenStart+1)
798 { tokens.add(WordChars.s[in.charAt(currentTokenStart)]); }
799 else // Generic (length != 1) case.
800 { tokens.add(in.subSequence(currentTokenStart, len).toString()); }
801 return(tokens);
802 }
803
804 /**If true, then don't try to match the immediately-previous input token.
805 * This is usually not going to be effective
806 * as unless we split tokens then we can never match the previous value
807 * as we alternate between word and non-word.
808 * So not trying to match saves a little bit of time.
809 * <p>
810 * This also frees up the "-1" offset value as an escape,
811 * eg for multi-token-match expansion.
812 */
813 private static final boolean DONT_TRY_TO_MATCH_PREV_TOKEN = true;
814
815 /**If true then allow multi-token matches (MTM) for better compression of very repetitive text.
816 * This is for where a number of tokens in sequence are identical
817 * to some previous (near) token sequence.
818 */
819 private static final boolean ALLOW_MULTI_TOKEN_MATCHING = DONT_TRY_TO_MATCH_PREV_TOKEN;
820
821 /**If true, allow us to do include 1-char tokens in multi-token matches (MTMs).
822 * This should help maximise the compression from multi-token matches,
823 * since many possible multi-token sequences will start with
824 * a single-character token, such as " ".
825 */
826 private static final boolean ALLOW_1_CHAR_MTM = true;
827
828 /**If true, allow partial (prefix) token matching for long (semi-)unique tokens.
829 * Can be slow, but allows a little more compression.
830 */
831 private static final boolean ALLOW_PARTIAL_TOKEN_MATCHING = true;
832
833 /**Bits that we use to represent a token sequence length; strictly positive and less than 8. */
834 private static final int TOKEN_SEQ_LEN_BITS = 3;
835
836 /**Minimum number of sequential tokens that we can encode as a sequence; strictly positive.
837 * Since the minimum encoding is usually 2 bytes
838 * then we set a minimum sequence length longer than this
839 * to guarantee to do better than just a sequence of separate back-references.
840 */
841 private static final int TOKEN_SEQ_LEN_MIN = 3;
842
843 /**Maximum number of sequential tokens that we can encode as a sequence; strictly positive.
844 * Note that we store the sequence length - 2,
845 * since we would use the normal back-reference for a single-token match,
846 * so we use a zero value to indicate a length-of-two sequence.
847 */
848 private static final int TOKEN_SEQ_LEN_MAX = (1 << TOKEN_SEQ_LEN_BITS) - 1 + TOKEN_SEQ_LEN_MIN;
849
850 /**Maximally-negative offset that we can hold above the sequence length in one byte; strictly negative.
851 * We have an implict logical 9th sign bit, always 1 (ie negative).
852 */
853 private static final int TOKEN_SEQ_OFFSET_MIN = (~0xff) >> TOKEN_SEQ_LEN_BITS; // Use sign-preserving right shift.
854
855
856 /**Specialised immutable sub-class to allow use with a StaticDictionary.
857 * Not directly visible outside this class,
858 * but is constructed by convertToCompact7BitString() as necessary.
859 * <p>
860 * Note that this can never be directly (de)serialised
861 * since this forces itself to String form when serialisation is attempted.
862 * This is because we cannot efficiently serialise the dictionary
863 * and thus would not have enough information to ensure correct decoding.
864 */
865 private final static class WithDict extends Compact7BitString
866 {
867 private WithDict(final byte text[], final StaticDictionary dict)
868 {
869 super(text);
870 if(dict == null)
871 { throw new IllegalArgumentException("bad object: null dictionary"); }
872 this.dict = dict;
873 }
874
875 /**Always write as (ie return) a String; never null. */
876 @Override
877 protected Object writeReplace()
878 throws ObjectStreamException
879 {
880 assert(dict != null); // We must write as a String, given that we have a dictionary.
881 return(MemoryTools.intern(toString()));
882 }
883
884 /**Static dictionary (if any) used to aid compression; never null.
885 * It is forbidden to serialise an instance as-is
886 * so as to prevent decompression errors/ambiguities,
887 * so these sub-classes as always serialised as String instances.
888 */
889 private final StaticDictionary dict;
890
891 /**Get the static dictionary, null if none. */
892 @Override
893 protected StaticDictionary getDict() { return(dict); }
894
895 /**Cache of the hash code for speed.
896 * Since we've blown some extra space on the (reference) to the dictionary,
897 * we'll blow a little more on a cache since hash computation can be expensive.
898 * <p>
899 * We assume that it has not been computed and cached if zero.
900 */
901 private int _hash;
902
903 /**Derive (and cache) a hash based on the text and the dictionary.
904 * This adds in some distinguishing sauce from the dictionary
905 * to better spread out hash codes from C7BS texts.
906 */
907 @Override
908 public int hashCode()
909 {
910 int h = _hash;
911 if(h == 0)
912 {
913 // Compute and cache the hash derived from that of the base class
914 // with some bits from the dictionary.
915 h = super.hashCode() - dict.name.hashCode();
916 _hash = h; // Cache the hash for next time.
917 }
918 return(h);
919 }
920
921 /**Serialisation UID; never really used. */
922 private static final long serialVersionUID = -7930788498700479835L;
923 }
924
925 /**Losslessly convert a String entirely consisting of 7-bit ASCII text to a tokenised form for compression/serialisation; not null unless the input is null.
926 * This should be especially effective for XML, but also OK for general text.
927 * <p>
928 * This is designed to be relatively fast and consume little memory.
929 * <p>
930 * Use of a good shared static dictionary between multiple instances
931 * may significantly reduce memory consumption,
932 * but it is not permitted to serialise instances using such a dictionary
933 * other than as a String value.
934 * <p>
935 * This does NOT intern() the result since it would not be appropriate in all cases
936 * (and could add significant CPU and memory overhead).
937 *
938 * @param s the 7-bit ASCII text to encode
939 * @param dict static compression dictionary to use, or null if none
940 *
941 * @throws IllegalArgumentException if the argument is not 7-bit ASCII
942 * ie contains characters outside the range 0--127
943 */
944 public static final Compact7BitString convertToCompact7BitString(final CharSequence s,
945 final StaticDictionary dict)
946 {
947 if(s == null) { return(null); }
948
949 // In the simple, zero-length, case we return the static/shared EMPTY value.
950 final int length = s.length();
951 if(length == 0) { return(EMPTY); }
952
953 // Verify that the input is pure 7-bit.
954 // Veto conversion with IllegalArgumentException if not.
955 for(int i = length; --i >= 0; )
956 {
957 final char c = s.charAt(i);
958 if(c > 127) { throw new IllegalArgumentException("not pure 7-bit ASCII text"); }
959 }
960
961 // Tokenise the input.
962 final List<String> tokensRaw = tokenise(s);
963 assert(tokensRaw instanceof RandomAccess); // We need fast random access into the list.
964 final List<String> tokens;
965 // Without a dictionary we use the tokens list from the tokeniser as-is.
966 if(dict == null)
967 { tokens = tokensRaw; }
968 // With a dictionary, we allow -ve get()s into the static tokens.
969 else
970 {
971 final class VirtualStringList extends AbstractList<String> implements RandomAccess
972 {
973 @Override public final String get(final int index)
974 {
975 // In the simple +ve index case, fetch from the underlying List.
976 if(index >= 0) { return(tokensRaw.get(index)); }
977 // Deal with -ve references into the static dictionary.
978 if(index <= StaticDictionary.FIRST_VIRTUAL_OFFSET)
979 {
980 final int vIndex = StaticDictionary.FIRST_VIRTUAL_OFFSET - index;
981 if(vIndex < dict.tokens.size())
982 { return(dict.tokens.get(vIndex)); }
983 throw new IndexOutOfBoundsException("illegal back reference: " + index);
984 }
985 return(""); // Deal with "gap" at -1 by returning an impossible token...
986 }
987
988 @Override public final void add(final int index, final String element)
989 { tokensRaw.add(index, element); }
990
991 @Override public final String set(final int index, final String element)
992 { return(tokensRaw.set(index, element)); }
993
994 /**Size is reported as that of the underlying "raw" List; -ve indexed items are not counted. */
995 @Override public final int size() { return(tokensRaw.size()); }
996 }
997 tokens = new VirtualStringList();
998 }
999
1000 // Map of all tokens seen so far to last position they were seen in.
1001 // All tokens here will be >1 char, unless ALLOW_1_CHAR_MTM is true.
1002 final HashMap<String,Integer> tokensSeenSoFar = new HashMap<String,Integer>(256 + 2*tokens.size());
1003
1004 // If we have a static dictionary then insert its tokens now.
1005 if(dict != null) { tokensSeenSoFar.putAll(dict.virtualOffsets); }
1006
1007 // Generate the output.
1008 final StringBuilder sb = new StringBuilder(s.length());
1009 main: for(int i = 0; i < tokens.size(); ++i)
1010 {
1011 final String token = tokens.get(i);
1012 final int tokenLength = token.length();
1013 assert(tokenLength > 0);
1014
1015 // We keep this single-character token as-is
1016 // (ie as an uncompressed literal byte)
1017 // unless we are potentially allowing this as
1018 // the start of a multi-token match.
1019 if(!ALLOW_1_CHAR_MTM && (tokenLength == 1))
1020 {
1021 if(VERBOSE_DEBUG) { System.out.println("LITERAL "+((int) token.charAt(0))); }
1022 tokensSeenSoFar.put(token, Integer.valueOf(i));
1023 sb.append(token);
1024 continue;
1025 }
1026
1027 // Current input token-list size.
1028 final int nTokensNow = tokens.size();
1029
1030 // Look for the nearest match backwards in the valid match window.
1031 // But only look if we know that the token has already been seen so far.
1032 final Integer lastPos = tokensSeenSoFar.get(token);
1033 if(lastPos != null)
1034 {
1035 final int lp = lastPos.intValue();
1036 assert(token.equals(tokens.get(lp)));
1037 final int offset = lp - i;
1038 // Ignore matches that we cannot use.
1039 if((offset >= Byte.MIN_VALUE) &&
1040 (!(DONT_TRY_TO_MATCH_PREV_TOKEN && (offset == -1))))
1041 {
1042 assert(token.equals(tokens.get(i+offset)));
1043
1044 if(ALLOW_MULTI_TOKEN_MATCHING)
1045 {
1046 // Look FORWARD and count how many sequential tokens match from here
1047 // (from index j inclusive up to index i exclusive)
1048 // which represent tokens already encoded/sent.
1049 // We may then be able to encode them all in one fell swoop...
1050 int count = 1;
1051 final int endStop = nTokensNow-1 + offset; // Can't search beyond end of the input!
1052 for(int k = lp; ++k <= endStop; )
1053 {
1054 // Stop if the sequence stops matching...
1055 if(!tokens.get(k).equals(tokens.get(k-offset)))
1056 { break; }
1057 // Stop if we reach the maximum sequence length that we can handle.
1058 if(++count == TOKEN_SEQ_LEN_MAX)
1059 { break; }
1060 }
1061 assert((count > 0) && (count <= TOKEN_SEQ_LEN_MAX));
1062 // Handle a long-enough-to-be-worthwhile sequence,
1063 // but only if it is within range...
1064 if(count >= TOKEN_SEQ_LEN_MIN)
1065 {
1066 final int normalisedOffset = offset + 1; // Offset of -1 is never used.
1067 if(normalisedOffset >= TOKEN_SEQ_OFFSET_MIN)
1068 {
1069 if(VERBOSE_DEBUG) { System.out.println("ENCODABLE SEQ COUNT, OFFSET, SEQ: " + count + ", " + offset + ", " + tokens.subList(i, i+count)); }
1070 sb.append((char) -1); // Escape code.
1071 final int normalisedCount = (count - TOKEN_SEQ_LEN_MIN);
1072 assert(normalisedCount < (1 << TOKEN_SEQ_LEN_BITS));
1073 final int encoded = (normalisedOffset << TOKEN_SEQ_LEN_BITS) + normalisedCount;
1074 sb.append((char) encoded); // Only the bottom byte is important.
1075
1076 // Note the first token processed.
1077 tokensSeenSoFar.put(token, Integer.valueOf(i));
1078
1079 // Skip an extra count-1 tokens that we're encoding...
1080 // Note all the (> length 1) tokens that we're skipping...
1081 for(int k = count; --k > 0; )
1082 {
1083 final String skippedToken = tokens.get(++i);
1084 if(ALLOW_1_CHAR_MTM || (skippedToken.length() > 1))
1085 { tokensSeenSoFar.put(skippedToken, Integer.valueOf(i)); }
1086 }
1087
1088 continue main;
1089 }
1090 }
1091 // Fall through to write normal single-token back-reference.
1092
1093 // If the token length is 1
1094 // then we prefer to write the literal over the back-ref
1095 // though it does not matter in fact.
1096 if(tokenLength == 1)
1097 {
1098 if(VERBOSE_DEBUG) { System.out.println("LITERAL "+((int) token.charAt(0))); }
1099 tokensSeenSoFar.put(token, Integer.valueOf(i));
1100 sb.append(token);
1101 continue;
1102 }
1103
1104 // We have a match to a single previous nearby token...
1105 // So write a single-byte relative back-reference rather than the token.
1106 if(VERBOSE_DEBUG) { System.out.println("BACK REF OFFSET "+offset+" = " + token); }
1107 tokensSeenSoFar.put(token, Integer.valueOf(i));
1108 sb.append((char) offset); // Only care about bottom byte...
1109 continue main;
1110 }
1111 }
1112 }
1113
1114 // If we could not find a suitable full-token match
1115 // then we may be able to find a partial match instead
1116 // (providing that this token is long enough to split in two).
1117 if(ALLOW_PARTIAL_TOKEN_MATCHING && (tokenLength > 2))
1118 {
1119 // We prefer to match the longest-possible prefix
1120 // to try to maximise compression.
1121 for(int prefixLength = tokenLength; --prefixLength > 1; )
1122 {
1123 final String prefix = token.substring(0, prefixLength);
1124 final Integer lastPrefixPos = tokensSeenSoFar.get(prefix);
1125 if(lastPrefixPos == null) { continue; }
1126 final int lpp = lastPrefixPos.intValue();
1127 assert(prefix.equals(tokens.get(lpp)));
1128 final int offset = lpp - i;
1129 // Skip matches that we cannot use.
1130 if(offset < Byte.MIN_VALUE) { continue; }
1131 if(DONT_TRY_TO_MATCH_PREV_TOKEN && (offset == -1)) { continue; }
1132 assert(prefix.equals(tokens.get(i+offset)));
1133 // We veto this prefix match (and of all other prefixes of this token)
1134 // if the whole token may be the target of a back-reference later.
1135 if(tokens.subList(i+1, Math.min(nTokensNow, i-Byte.MIN_VALUE)).contains(token)) { break; }
1136 //System.out.println("Prefix match allowed, length: " + prefix.length());
1137 // Adjust tokens list with new split token.
1138 tokens.set(i, prefix);
1139 tokens.add(i+1, token.substring(prefixLength));
1140 if(VERBOSE_DEBUG) { System.out.println("BACK (PREFIX) REF OFFSET "+offset+" = " + token); }
1141 tokensSeenSoFar.put(prefix, Integer.valueOf(i));
1142 sb.append((char) offset); // Only care about bottom byte...
1143 continue main;
1144 }
1145 }
1146
1147 // Log this uncompressed >1-char early token
1148 // if the first occurrence for this compression/input
1149 // if compiling global stats
1150 // and if this is close to the start of the input.
1151 if(LOG_STATS && (dict != null) &&
1152 (tokenLength > 1) && (i < -Byte.MIN_VALUE) &&
1153 (null == tokensSeenSoFar.get(token)))
1154 {
1155 // Create a counter for this token if need be.
1156 dict._uncompEarlyTokens.putIfAbsent(token, new Tuple.Pair<AtomicInteger,AtomicInteger>(new AtomicInteger(),new AtomicInteger()));
1157 // Atomically increment the counter for this token.
1158 final Pair<AtomicInteger, AtomicInteger> pair = dict._uncompEarlyTokens.get(token);
1159 final int tc = pair.first.incrementAndGet();
1160 // Update the summed earliest-position value...
1161 pair.second.addAndGet(i);
1162 if(VERBOSE_DEBUG && ((tc % 1000) == 0)) { System.out.println("[Token count for '"+token+"': "+tc+".]"); }
1163 }
1164
1165 // Now now update tokensSeenSoFar with this token.
1166 tokensSeenSoFar.put(token, Integer.valueOf(i));
1167 // Did not find a (near enough) match, so write token as-is.
1168 sb.append(token);
1169 }
1170 //System.out.println("[Length in (chars) / out (bytes): "+s.length()+"/"+sb.length()+".]");
1171
1172 // Convert back to byte[].
1173 final byte[] text = new byte[sb.length()];
1174 for(int i = text.length; --i >= 0; )
1175 { text[i] = (byte) sb.charAt(i); }
1176
1177 if(VERBOSE_DEBUG) { System.out.println("--- ENCODE DONE"); }
1178 final Compact7BitString result = (dict == null) ?
1179 new Compact7BitString(text) : new WithDict(text, dict);
1180 // assert(s.equals(result.toString())) : "Generated bad compressed form: '"+result+"' from input: '"+s+"'";
1181 // DO NOT intern(): allow the caller to decide if that is appropriate.
1182 return(result);
1183 }
1184 }