Source for java.text.RuleBasedCollator

   1: /* RuleBasedCollator.java -- Concrete Collator Class
   2:    Copyright (C) 1998, 1999, 2000, 2001, 2003, 2004, 2005  Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package java.text;
  40: 
  41: import gnu.classpath.NotImplementedException;
  42: 
  43: import java.util.ArrayList;
  44: import java.util.HashMap;
  45: 
  46: /* Written using "Java Class Libraries", 2nd edition, plus online
  47:  * API docs for JDK 1.2 from http://www.javasoft.com.
  48:  * Status: Believed complete and correct
  49:  */
  50: 
  51: /**
  52:  * This class is a concrete subclass of <code>Collator</code> suitable
  53:  * for string collation in a wide variety of languages.  An instance of
  54:  * this class is normally returned by the <code>getInstance</code> method
  55:  * of <code>Collator</code> with rules predefined for the requested
  56:  * locale.  However, an instance of this class can be created manually
  57:  * with any desired rules.
  58:  * <p>
  59:  * Rules take the form of a <code>String</code> with the following syntax
  60:  * <ul>
  61:  * <li> Modifier: '@'</li>
  62:  * <li> Relation: '&lt;' | ';' | ',' | '=' : &lt;text&gt;</li>
  63:  * <li> Reset: '&amp;' : &lt;text&gt;</li>
  64:  * </ul>
  65:  * The modifier character indicates that accents sort backward as is the
  66:  * case with French.  The modifier applies to all rules <b>after</b>
  67:  * the modifier but before the next primary sequence. If placed at the end
  68:  * of the sequence if applies to all unknown accented character.
  69:  * The relational operators specify how the text
  70:  * argument relates to the previous term.  The relation characters have
  71:  * the following meanings:
  72:  * <ul>
  73:  * <li>'&lt;' - The text argument is greater than the prior term at the primary
  74:  * difference level.</li>
  75:  * <li>';' - The text argument is greater than the prior term at the secondary
  76:  * difference level.</li>
  77:  * <li>',' - The text argument is greater than the prior term at the tertiary
  78:  * difference level.</li>
  79:  * <li>'=' - The text argument is equal to the prior term</li>
  80:  * </ul>
  81:  * <p>
  82:  * As for the text argument itself, this is any sequence of Unicode
  83:  * characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F,
  84:  * 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are
  85:  * desired, they must be enclosed in single quotes.  If any whitespace is
  86:  * encountered, it is ignored.  (For example, "a b" is equal to "ab").
  87:  * <p>
  88:  * The reset operation inserts the following rule at the point where the
  89:  * text argument to it exists in the previously declared rule string.  This
  90:  * makes it easy to add new rules to an existing string by simply including
  91:  * them in a reset sequence at the end.  Note that the text argument, or
  92:  * at least the first character of it, must be present somewhere in the
  93:  * previously declared rules in order to be inserted properly.  If this
  94:  * is not satisfied, a <code>ParseException</code> will be thrown.
  95:  * <p>
  96:  * This system of configuring <code>RuleBasedCollator</code> is needlessly
  97:  * complex and the people at Taligent who developed it (along with the folks
  98:  * at Sun who accepted it into the Java standard library) deserve a slow
  99:  * and agonizing death.
 100:  * <p>
 101:  * Here are a couple of example of rule strings:
 102:  * <p>
 103:  * "&lt; a &lt; b &lt; c" - This string says that a is greater than b which is
 104:  * greater than c, with all differences being primary differences.
 105:  * <p>
 106:  * "&lt; a,A &lt; b,B &lt; c,C" - This string says that 'A' is greater than 'a' with
 107:  * a tertiary strength comparison.  Both 'b' and 'B' are greater than 'a' and
 108:  * 'A' during a primary strength comparison.  But 'B' is greater than 'b'
 109:  * under a tertiary strength comparison.
 110:  * <p>
 111:  * "&lt; a &lt; c &amp; a &lt; b " - This sequence is identical in function to the
 112:  * "&lt; a &lt; b &lt; c" rule string above.  The '&amp;' reset symbol indicates that
 113:  * the rule "&lt; b" is to be inserted after the text argument "a" in the
 114:  * previous rule string segment.
 115:  * <p>
 116:  * "&lt; a &lt; b &amp; y &lt; z" - This is an error.  The character 'y' does not appear
 117:  * anywhere in the previous rule string segment so the rule following the
 118:  * reset rule cannot be inserted.
 119:  * <p>
 120:  * "&lt; a &amp; A @ &lt; e &amp; E &lt; f&amp; F" - This sequence is equivalent to the following
 121:  * "&lt; a &amp; A &lt; E &amp; e &lt; f &amp; F".
 122:  * <p>
 123:  * For a description of the various comparison strength types, see the
 124:  * documentation for the <code>Collator</code> class.
 125:  * <p>
 126:  * As an additional complication to this already overly complex rule scheme,
 127:  * if any characters precede the first rule, these characters are considered
 128:  * ignorable.  They will be treated as if they did not exist during
 129:  * comparisons.  For example, "- &lt; a &lt; b ..." would make '-' an ignorable
 130:  * character such that the strings "high-tech" and "hightech" would
 131:  * be considered identical.
 132:  * <p>
 133:  * A <code>ParseException</code> will be thrown for any of the following
 134:  * conditions:
 135:  * <ul>
 136:  * <li>Unquoted punctuation characters in a text argument.</li>
 137:  * <li>A relational or reset operator not followed by a text argument</li>
 138:  * <li>A reset operator where the text argument is not present in
 139:  * the previous rule string section.</li>
 140:  * </ul>
 141:  *
 142:  * @author Aaron M. Renn (arenn@urbanophile.com)
 143:  * @author Tom Tromey (tromey@cygnus.com)
 144:  * @author Guilhem Lavaux (guilhem@kaffe.org)
 145:  */
 146: public class RuleBasedCollator extends Collator
 147: {
 148:   /**
 149:    * This class describes what rank has a character (or a sequence of characters)
 150:    * in the lexicographic order. Each element in a rule has a collation element.
 151:    */
 152:   static final class CollationElement
 153:   {
 154:     final String key;
 155:     final int primary;
 156:     final short secondary;
 157:     final short tertiary;
 158:     final short equality;
 159:     final boolean ignore;
 160:     final String expansion;
 161: 
 162:     CollationElement(String key, int primary, short secondary, short tertiary,
 163:                      short equality, String expansion, boolean ignore)
 164:     {
 165:       this.key = key;
 166:       this.primary = primary;
 167:       this.secondary = secondary;
 168:       this.tertiary = tertiary;
 169:       this.equality = equality;
 170:       this.ignore = ignore;
 171:       this.expansion = expansion;
 172:     }
 173: 
 174:     int getValue()
 175:     {
 176:       return (primary << 16) + (secondary << 8) + tertiary;
 177:     }
 178:   }
 179: 
 180:   /**
 181:    * Basic collation instruction (internal format) to build the series of
 182:    * collation elements. It contains an instruction which specifies the new
 183:    * state of the generator. The sequence of instruction should not contain
 184:    * RESET (it is used by
 185:    * {@link #mergeRules(int,java.lang.String,java.util.ArrayList,java.util.ArrayList)})
 186:    * as a temporary state while merging two sets of instructions.
 187:    */
 188:   private static final class CollationSorter
 189:   {
 190:     static final int GREATERP = 0;
 191:     static final int GREATERS = 1;
 192:     static final int GREATERT = 2;
 193:     static final int EQUAL = 3;
 194:     static final int RESET = 4;
 195:     static final int INVERSE_SECONDARY = 5;
 196: 
 197:     final int comparisonType;
 198:     final String textElement;
 199:     final int hashText;
 200:     final int offset;
 201:     final boolean ignore;
 202: 
 203:     String expansionOrdering;
 204: 
 205:     private CollationSorter(final int comparisonType, final String textElement,
 206:                             final int offset, final boolean ignore)
 207:     {
 208:       this.comparisonType = comparisonType;
 209:       this.textElement = textElement;
 210:       this.offset = offset;
 211:       this.ignore = ignore;
 212:       hashText = textElement.hashCode();
 213:     }
 214:   }
 215: 
 216:   /**
 217:    * This is the original rule string.
 218:    */
 219:   private String rules;
 220: 
 221:   /**
 222:    * This is the table of collation element values
 223:    */
 224:   private CollationElement[] ce_table;
 225: 
 226:   /**
 227:    * Quick-prefix finder.
 228:    */
 229:   HashMap<String,CollationElement> prefix_tree;
 230: 
 231:   /**
 232:    * This is the value of the last sequence entered into
 233:    * <code>ce_table</code>. It is used to compute the
 234:    * ordering value of unspecified character.
 235:    */
 236:   private int last_primary_value;
 237: 
 238:   /**
 239:    * This is the value of the last secondary sequence of the
 240:    * primary 0, entered into
 241:    * <code>ce_table</code>. It is used to compute the
 242:    * ordering value of an unspecified accented character.
 243:    */
 244:   private int last_tertiary_value;
 245: 
 246:   /**
 247:    * This variable is true if accents need to be sorted
 248:    * in the other direction.
 249:    */
 250:   private boolean inverseAccentComparison;
 251: 
 252:   /**
 253:    * This collation element is special to unknown sequence.
 254:    * The JDK uses it to mark and sort the characters which has
 255:    * no collation rules.
 256:    */
 257:   static final CollationElement SPECIAL_UNKNOWN_SEQ =
 258:     new CollationElement("", (short) 32767, (short) 0, (short) 0,
 259:                          (short) 0, null, false);
 260: 
 261:   /**
 262:    * This method initializes a new instance of <code>RuleBasedCollator</code>
 263:    * with the specified collation rules.  Note that an application normally
 264:    * obtains an instance of <code>RuleBasedCollator</code> by calling the
 265:    * <code>getInstance</code> method of <code>Collator</code>.  That method
 266:    * automatically loads the proper set of rules for the desired locale.
 267:    *
 268:    * @param rules The collation rule string.
 269:    *
 270:    * @exception ParseException If the rule string contains syntax errors.
 271:    */
 272:   public RuleBasedCollator(String rules) throws ParseException
 273:   {
 274:     if (rules.equals(""))
 275:       throw new ParseException("empty rule set", 0);
 276: 
 277:     this.rules = rules;
 278: 
 279:     buildCollationVector(parseString(rules));
 280:     buildPrefixAccess();
 281:   }
 282: 
 283:   /**
 284:    * This method returns the number of common characters at the beginning
 285:    * of the string of the two parameters.
 286:    *
 287:    * @param prefix A string considered as a prefix to test against
 288:    * the other string.
 289:    * @param s A string to test the prefix against.
 290:    * @return The number of common characters.
 291:    */
 292:   static int findPrefixLength(String prefix, String s)
 293:   {
 294:     int index;
 295:     int len = prefix.length();
 296: 
 297:     for (index = 0; index < len && index < s.length(); ++index)
 298:       {
 299:         if (prefix.charAt(index) != s.charAt(index))
 300:           return index;
 301:       }
 302: 
 303: 
 304:     return index;
 305:   }
 306: 
 307:   /**
 308:    * Here we are merging two sets of sorting instructions: 'patch' into 'main'. This methods
 309:    * checks whether it is possible to find an anchor point for the rules to be merged and
 310:    * then insert them at that precise point.
 311:    *
 312:    * @param offset Offset in the string containing rules of the beginning of the rules
 313:    * being merged in.
 314:    * @param starter Text of the rules being merged.
 315:    * @param main Repository of all already parsed rules.
 316:    * @param patch Rules to be merged into the repository.
 317:    * @throws ParseException if it is impossible to find an anchor point for the new rules.
 318:    */
 319:   private void mergeRules(int offset, String starter, ArrayList<CollationSorter> main,
 320:                           ArrayList<CollationSorter> patch)
 321:     throws ParseException
 322:   {
 323:     int insertion_point = -1;
 324:     int max_length = 0;
 325: 
 326:     /* We must check that no rules conflict with another already present. If it
 327:      * is the case delete the old rule.
 328:      */
 329: 
 330:     /* For the moment good old O(N^2) algorithm.
 331:      */
 332:     for (int i = 0; i < patch.size(); i++)
 333:       {
 334:         int j = 0;
 335: 
 336:         while (j < main.size())
 337:           {
 338:             CollationSorter rule1 = patch.get(i);
 339:             CollationSorter rule2 = main.get(j);
 340: 
 341:             if (rule1.textElement.equals(rule2.textElement))
 342:               main.remove(j);
 343:             else
 344:               j++;
 345:           }
 346:       }
 347: 
 348:     // Find the insertion point... O(N)
 349:     for (int i = 0; i < main.size(); i++)
 350:       {
 351:         CollationSorter sorter = main.get(i);
 352:         int length = findPrefixLength(starter, sorter.textElement);
 353: 
 354:         if (length > max_length)
 355:           {
 356:             max_length = length;
 357:             insertion_point = i+1;
 358:           }
 359:       }
 360: 
 361:     if (insertion_point < 0)
 362:       throw new ParseException("no insertion point found for " + starter, offset);
 363: 
 364:     if (max_length < starter.length())
 365:       {
 366:         /*
 367:          * We need to expand the first entry. It must be sorted
 368:          * like if it was the reference key itself (like the spec
 369:          * said. So the first entry is special: the element is
 370:          * replaced by the specified text element for the sorting.
 371:          * This text replace the old one for comparisons. However
 372:          * to preserve the behaviour we replace the first key (corresponding
 373:          * to the found prefix) by a new code rightly ordered in the
 374:          * sequence. The rest of the subsequence must be appended
 375:          * to the end of the sequence.
 376:          */
 377:         CollationSorter sorter = patch.get(0);
 378: 
 379:         sorter.expansionOrdering = starter.substring(max_length); // Skip the first good prefix element
 380: 
 381:         main.add(insertion_point, sorter);
 382: 
 383:         /*
 384:          * This is a new set of rules. Append to the list.
 385:          */
 386:         patch.remove(0);
 387:         insertion_point++;
 388:       }
 389: 
 390:     // Now insert all elements of patch at the insertion point.
 391:     for (int i = 0; i < patch.size(); i++)
 392:       main.add(i+insertion_point, patch.get(i));
 393:   }
 394: 
 395:   /**
 396:    * This method parses a string and build a set of sorting instructions. The parsing
 397:    * may only be partial on the case the rules are to be merged sometime later.
 398:    *
 399:    * @param stop_on_reset If this parameter is true then the parser stops when it
 400:    * encounters a reset instruction. In the other case, it tries to parse the subrules
 401:    * and merged it in the same repository.
 402:    * @param v Output vector for the set of instructions.
 403:    * @param base_offset Offset in the string to begin parsing.
 404:    * @param rules Rules to be parsed.
 405:    * @return -1 if the parser reached the end of the string, an integer representing the
 406:    * offset in the string at which it stopped parsing.
 407:    * @throws ParseException if something turned wrong during the parsing. To get details
 408:    * decode the message.
 409:    */
 410:   private int subParseString(boolean stop_on_reset, ArrayList<CollationSorter> v,
 411:                              int base_offset, String rules)
 412:     throws ParseException
 413:   {
 414:     boolean ignoreChars = (base_offset == 0);
 415:     int operator = -1;
 416:     StringBuilder sb = new StringBuilder();
 417:     boolean doubleQuote = false;
 418:     boolean eatingChars = false;
 419:     boolean nextIsModifier = false;
 420:     boolean isModifier = false;
 421:     int i;
 422: 
 423: main_parse_loop:
 424:     for (i = 0; i < rules.length(); i++)
 425:       {
 426:         char c = rules.charAt(i);
 427:         int type = -1;
 428: 
 429:         if (!eatingChars &&
 430:             ((c >= 0x09 && c <= 0x0D) || (c == 0x20)))
 431:               continue;
 432: 
 433:         isModifier = nextIsModifier;
 434:         nextIsModifier = false;
 435: 
 436:         if (eatingChars && c != '\'')
 437:           {
 438:             doubleQuote = false;
 439:             sb.append(c);
 440:             continue;
 441:           }
 442:         if (doubleQuote && eatingChars)
 443:           {
 444:             sb.append(c);
 445:             doubleQuote = false;
 446:             continue;
 447:           }
 448: 
 449:         switch (c)
 450:           {
 451:           case '!':
 452:             throw new ParseException
 453:               ("Modifier '!' is not yet supported by Classpath", i + base_offset);
 454:           case '<':
 455:             type = CollationSorter.GREATERP;
 456:             break;
 457:           case ';':
 458:             type = CollationSorter.GREATERS;
 459:             break;
 460:           case ',':
 461:             type = CollationSorter.GREATERT;
 462:             break;
 463:           case '=':
 464:             type = CollationSorter.EQUAL;
 465:             break;
 466:           case '\'':
 467:             eatingChars = !eatingChars;
 468:             doubleQuote = true;
 469:             break;
 470:           case '@':
 471:             if (ignoreChars)
 472:               throw new ParseException
 473:                 ("comparison list has not yet been started. You may only use"
 474:                  + "(<,;=&)", i + base_offset);
 475:             // Inverse the order of secondaries from now on.
 476:             nextIsModifier = true;
 477:             type = CollationSorter.INVERSE_SECONDARY;
 478:             break;
 479:           case '&':
 480:             type = CollationSorter.RESET;
 481:             if (stop_on_reset)
 482:               break main_parse_loop;
 483:             break;
 484:           default:
 485:             if (operator < 0)
 486:               throw new ParseException
 487:                 ("operator missing at " + (i + base_offset), i + base_offset);
 488:             if (! eatingChars
 489:                 && ((c >= 0x21 && c <= 0x2F)
 490:                     || (c >= 0x3A && c <= 0x40)
 491:                     || (c >= 0x5B && c <= 0x60)
 492:                     || (c >= 0x7B && c <= 0x7E)))
 493:               throw new ParseException
 494:                 ("unquoted punctuation character '" + c + "'", i + base_offset);
 495: 
 496:             //type = ignoreChars ? CollationSorter.IGNORE : -1;
 497:             sb.append(c);
 498:             break;
 499:           }
 500: 
 501:         if (type  < 0)
 502:           continue;
 503: 
 504:         if (operator < 0)
 505:           {
 506:             operator = type;
 507:             continue;
 508:           }
 509: 
 510:         if (sb.length() == 0 && !isModifier)
 511:           throw new ParseException
 512:             ("text element empty at " + (i+base_offset), i+base_offset);
 513: 
 514:         if (operator == CollationSorter.RESET)
 515:           {
 516:             /* Reposition in the sorting list at the position
 517:              * indicated by the text element.
 518:              */
 519:             String subrules = rules.substring(i);
 520:             ArrayList<CollationSorter> sorted_rules = new ArrayList<CollationSorter>();
 521:             int idx;
 522: 
 523:             // Parse the subrules but do not iterate through all
 524:             // sublist. This is the privilege of the first call.
 525:             idx = subParseString(true, sorted_rules, base_offset+i, subrules);
 526: 
 527:             // Merge new parsed rules into the list.
 528:             mergeRules(base_offset+i, sb.toString(), v, sorted_rules);
 529:             sb.setLength(0);
 530: 
 531:             // Reset state to none.
 532:             operator = -1;
 533:             type = -1;
 534:             // We have found a new subrule at 'idx' but it has not been parsed.
 535:             if (idx >= 0)
 536:               {
 537:                 i += idx-1;
 538:                 continue main_parse_loop;
 539:               }
 540:             else
 541:                 // No more rules.
 542:                 break main_parse_loop;
 543:           }
 544: 
 545:         String textElement = sb.toString();
 546:         if (operator == CollationSorter.GREATERP)
 547:           ignoreChars = false;
 548:         CollationSorter sorter = new CollationSorter(operator, textElement,
 549:                                                      base_offset + rules.length(),
 550:                                                      ignoreChars);
 551:         sb.setLength(0);
 552: 
 553:         v.add(sorter);
 554:         operator = type;
 555:       }
 556: 
 557:     if (operator >= 0)
 558:       {
 559:         int pos = rules.length() + base_offset;
 560: 
 561:         if ((sb.length() != 0 && nextIsModifier)
 562:             || (sb.length() == 0 && !nextIsModifier && !eatingChars))
 563:           throw new ParseException("text element empty at " + pos, pos);
 564: 
 565:         if (operator == CollationSorter.GREATERP)
 566:           ignoreChars = false;
 567: 
 568:         CollationSorter sorter = new CollationSorter(operator, sb.toString(),
 569:                                                      base_offset+pos, ignoreChars);
 570:         v.add(sorter);
 571:       }
 572: 
 573:     if (i == rules.length())
 574:       return -1;
 575:     else
 576:       return i;
 577:   }
 578: 
 579:   /**
 580:    * This method creates a copy of this object.
 581:    *
 582:    * @return A copy of this object.
 583:    */
 584:   public Object clone()
 585:   {
 586:     return super.clone();
 587:   }
 588: 
 589:   /**
 590:    * This method completely parses a string 'rules' containing sorting rules.
 591:    *
 592:    * @param rules String containing the rules to be parsed.
 593:    * @return A set of sorting instructions stored in a Vector.
 594:    * @throws ParseException if something turned wrong during the parsing. To get details
 595:    * decode the message.
 596:    */
 597:   private ArrayList<CollationSorter> parseString(String rules)
 598:     throws ParseException
 599:   {
 600:     ArrayList<CollationSorter> v = new ArrayList<CollationSorter>();
 601: 
 602:     // result of the first subParseString is not absolute (may be -1 or a
 603:     // positive integer). But we do not care.
 604:     subParseString(false, v, 0, rules);
 605: 
 606:     return v;
 607:   }
 608: 
 609:   /**
 610:    * This method uses the sorting instructions built by {@link #parseString}
 611:    * to build collation elements which can be directly used to sort strings.
 612:    *
 613:    * @param parsedElements Parsed instructions stored in a ArrayList.
 614:    * @throws ParseException if the order of the instructions are not valid.
 615:    */
 616:   private void buildCollationVector(ArrayList<CollationSorter> parsedElements)
 617:     throws ParseException
 618:   {
 619:     int primary_seq = 0;
 620:     int last_tertiary_seq = 0;
 621:     short secondary_seq = 0;
 622:     short tertiary_seq = 0;
 623:     short equality_seq = 0;
 624:     boolean inverseComparisons = false;
 625:     final boolean DECREASING = false;
 626:     final boolean INCREASING = true;
 627:     boolean secondaryType = INCREASING;
 628:     ArrayList<CollationElement> v = new ArrayList<CollationElement>();
 629: 
 630:     // elts is completely sorted.
 631: element_loop:
 632:     for (int i = 0; i < parsedElements.size(); i++)
 633:       {
 634:         CollationSorter elt = parsedElements.get(i);
 635: 
 636:         switch (elt.comparisonType)
 637:           {
 638:           case CollationSorter.GREATERP:
 639:             primary_seq++;
 640:             if (inverseComparisons)
 641:               {
 642:                 secondary_seq = Short.MAX_VALUE;
 643:                 secondaryType = DECREASING;
 644:               }
 645:             else
 646:               {
 647:                 secondary_seq = 0;
 648:                 secondaryType = INCREASING;
 649:               }
 650:             tertiary_seq = 0;
 651:             equality_seq = 0;
 652:             inverseComparisons = false;
 653:             break;
 654:           case CollationSorter.GREATERS:
 655:             if (secondaryType == DECREASING)
 656:               secondary_seq--;
 657:             else
 658:               secondary_seq++;
 659:             tertiary_seq = 0;
 660:             equality_seq = 0;
 661:             break;
 662:           case CollationSorter.INVERSE_SECONDARY:
 663:             inverseComparisons = true;
 664:             continue element_loop;
 665:           case CollationSorter.GREATERT:
 666:             tertiary_seq++;
 667:             if (primary_seq == 0)
 668:               last_tertiary_seq = tertiary_seq;
 669:             equality_seq = 0;
 670:             break;
 671:           case CollationSorter.EQUAL:
 672:             equality_seq++;
 673:             break;
 674:           case CollationSorter.RESET:
 675:             throw new ParseException
 676:               ("Invalid reached state 'RESET'. Internal error", elt.offset);
 677:           default:
 678:             throw new ParseException
 679:               ("Invalid unknown state '" + elt.comparisonType + "'", elt.offset);
 680:           }
 681: 
 682:         v.add(new CollationElement(elt.textElement, primary_seq,
 683:                                    secondary_seq, tertiary_seq,
 684:                                    equality_seq, elt.expansionOrdering, elt.ignore));
 685:       }
 686: 
 687:     this.inverseAccentComparison = inverseComparisons;
 688: 
 689:     ce_table = v.toArray(new CollationElement[v.size()]);
 690: 
 691:     last_primary_value = primary_seq+1;
 692:     last_tertiary_value = last_tertiary_seq+1;
 693:   }
 694: 
 695:   /**
 696:    * Build a tree where all keys are the texts of collation elements and data is
 697:    * the collation element itself. The tree is used when extracting all prefix
 698:    * for a given text.
 699:    */
 700:   private void buildPrefixAccess()
 701:   {
 702:     prefix_tree = new HashMap<String,CollationElement>();
 703: 
 704:     for (int i = 0; i < ce_table.length; i++)
 705:       {
 706:         CollationElement e = ce_table[i];
 707: 
 708:         prefix_tree.put(e.key, e);
 709:       }
 710:   }
 711: 
 712:   /**
 713:    * This method returns an integer which indicates whether the first
 714:    * specified <code>String</code> is less than, greater than, or equal to
 715:    * the second.  The value depends not only on the collation rules in
 716:    * effect, but also the strength and decomposition settings of this object.
 717:    *
 718:    * @param source The first <code>String</code> to compare.
 719:    * @param target A second <code>String</code> to compare to the first.
 720:    *
 721:    * @return A negative integer if source &lt; target, a positive integer
 722:    * if source &gt; target, or 0 if source == target.
 723:    */
 724:   public int compare(String source, String target)
 725:   {
 726:     CollationElementIterator cs, ct;
 727:     CollationElement ord1block = null;
 728:     CollationElement ord2block = null;
 729:     boolean advance_block_1 = true;
 730:     boolean advance_block_2 = true;
 731: 
 732:     cs = getCollationElementIterator(source);
 733:     ct = getCollationElementIterator(target);
 734: 
 735:     for(;;)
 736:       {
 737:         int ord1;
 738:         int ord2;
 739: 
 740:         /*
 741:          * We have to check whether the characters are ignorable.
 742:          * If it is the case then forget them.
 743:          */
 744:         if (advance_block_1)
 745:           {
 746:             ord1block = cs.nextBlock();
 747:             if (ord1block != null && ord1block.ignore)
 748:               continue;
 749:           }
 750: 
 751:         if (advance_block_2)
 752:           {
 753:             ord2block = ct.nextBlock();
 754:             if (ord2block != null && ord2block.ignore)
 755:               {
 756:                 advance_block_1 = false;
 757:                 continue;
 758:               }
 759:          }
 760:         else
 761:           advance_block_2 = true;
 762: 
 763:         if (!advance_block_1)
 764:           advance_block_1 = true;
 765: 
 766:         if (ord1block != null)
 767:           ord1 = ord1block.getValue();
 768:         else
 769:           {
 770:             if (ord2block == null)
 771:               return 0;
 772:             return -1;
 773:           }
 774: 
 775:         if (ord2block == null)
 776:           return 1;
 777: 
 778:         ord2 = ord2block.getValue();
 779: 
 780:         // We know chars are totally equal, so skip
 781:         if (ord1 == ord2)
 782:           {
 783:             if (getStrength() == IDENTICAL)
 784:               if (!ord1block.key.equals(ord2block.key))
 785:                 return ord1block.key.compareTo(ord2block.key);
 786:             continue;
 787:           }
 788: 
 789:         // Check for primary strength differences
 790:         int prim1 = CollationElementIterator.primaryOrder(ord1);
 791:         int prim2 = CollationElementIterator.primaryOrder(ord2);
 792: 
 793:         if (prim1 == 0 && getStrength() < TERTIARY)
 794:           {
 795:             advance_block_2 = false;
 796:             continue;
 797:           }
 798:         else if (prim2 == 0 && getStrength() < TERTIARY)
 799:           {
 800:             advance_block_1 = false;
 801:             continue;
 802:           }
 803: 
 804:         if (prim1 < prim2)
 805:           return -1;
 806:         else if (prim1 > prim2)
 807:           return 1;
 808:         else if (getStrength() == PRIMARY)
 809:           continue;
 810: 
 811:         // Check for secondary strength differences
 812:         int sec1 = CollationElementIterator.secondaryOrder(ord1);
 813:         int sec2 = CollationElementIterator.secondaryOrder(ord2);
 814: 
 815:         if (sec1 < sec2)
 816:           return -1;
 817:         else if (sec1 > sec2)
 818:           return 1;
 819:         else if (getStrength() == SECONDARY)
 820:           continue;
 821: 
 822:         // Check for tertiary differences
 823:         int tert1 = CollationElementIterator.tertiaryOrder(ord1);
 824:         int tert2 = CollationElementIterator.tertiaryOrder(ord2);
 825: 
 826:         if (tert1 < tert2)
 827:           return -1;
 828:         else if (tert1 > tert2)
 829:           return 1;
 830:         else if (getStrength() == TERTIARY)
 831:           continue;
 832: 
 833:         // Apparently JDK does this (at least for my test case).
 834:         return ord1block.key.compareTo(ord2block.key);
 835:       }
 836:   }
 837: 
 838:   /**
 839:    * This method tests this object for equality against the specified
 840:    * object.  This will be true if and only if the specified object is
 841:    * another reference to this object.
 842:    *
 843:    * @param obj The <code>Object</code> to compare against this object.
 844:    *
 845:    * @return <code>true</code> if the specified object is equal to this object,
 846:    * <code>false</code> otherwise.
 847:    */
 848:   public boolean equals(Object obj)
 849:   {
 850:     if (obj == this)
 851:       return true;
 852:     else
 853:       return false;
 854:   }
 855: 
 856:   /**
 857:    * This method builds a default collation element without invoking
 858:    * the database created from the rules passed to the constructor.
 859:    *
 860:    * @param c Character which needs a collation element.
 861:    * @return A valid brand new CollationElement instance.
 862:    */
 863:   CollationElement getDefaultElement(char c)
 864:   {
 865:     int v;
 866: 
 867:     // Preliminary support for generic accent sorting inversion (I don't know if all
 868:     // characters in the range should be sorted backward). This is the place
 869:     // to fix this if needed.
 870:     if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
 871:       v = 0x0361 - ((int) c - 0x02B9);
 872:     else
 873:       v = (short) c;
 874:     return new CollationElement("" + c, last_primary_value + v,
 875:                                 (short) 0, (short) 0, (short) 0, null, false);
 876:   }
 877: 
 878:   /**
 879:    * This method builds a default collation element for an accented character
 880:    * without invoking the database created from the rules passed to the constructor.
 881:    *
 882:    * @param c Character which needs a collation element.
 883:    * @return A valid brand new CollationElement instance.
 884:    */
 885:   CollationElement getDefaultAccentedElement(char c)
 886:   {
 887:     int v;
 888: 
 889:     // Preliminary support for generic accent sorting inversion (I don't know if all
 890:     // characters in the range should be sorted backward). This is the place
 891:     // to fix this if needed.
 892:     if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
 893:       v = 0x0361 - ((int) c - 0x02B9);
 894:     else
 895:       v = (short) c;
 896:     return new CollationElement("" + c, (short) 0,
 897:                                 (short) 0, (short) (last_tertiary_value + v), (short) 0, null, false);
 898:   }
 899: 
 900:   /**
 901:    * This method returns an instance for <code>CollationElementIterator</code>
 902:    * for the specified <code>String</code> under the collation rules for this
 903:    * object.
 904:    *
 905:    * @param source The <code>String</code> to return the
 906:    * <code>CollationElementIterator</code> instance for.
 907:    *
 908:    * @return A <code>CollationElementIterator</code> for the specified
 909:    * <code>String</code>.
 910:    */
 911:   public CollationElementIterator getCollationElementIterator(String source)
 912:   {
 913:     return new CollationElementIterator(this, source);
 914:   }
 915: 
 916:   /**
 917:    * This method returns an instance of <code>CollationElementIterator</code>
 918:    * for the <code>String</code> represented by the specified
 919:    * <code>CharacterIterator</code>.
 920:    *
 921:    * @param source The <code>CharacterIterator</code> with the desired <code>String</code>.
 922:    *
 923:    * @return A <code>CollationElementIterator</code> for the specified <code>String</code>.
 924:    */
 925:   public CollationElementIterator getCollationElementIterator(CharacterIterator source)
 926:   {
 927:     return new CollationElementIterator(this, source);
 928:   }
 929: 
 930:   /**
 931:    * This method returns an instance of <code>CollationKey</code> for the
 932:    * specified <code>String</code>.  The object returned will have a
 933:    * more efficient mechanism for its comparison function that could
 934:    * provide speed benefits if multiple comparisons are performed, such
 935:    * as during a sort.
 936:    *
 937:    * @param source The <code>String</code> to create a <code>CollationKey</code> for.
 938:    *
 939:    * @return A <code>CollationKey</code> for the specified <code>String</code>.
 940:    */
 941:   public CollationKey getCollationKey(String source)
 942:   {
 943:     CollationElementIterator cei = getCollationElementIterator(source);
 944:     ArrayList<Integer> vect = new ArrayList<Integer>();
 945: 
 946:     int ord = cei.next();
 947:     cei.reset(); //set to start of string
 948: 
 949:     while (ord != CollationElementIterator.NULLORDER)
 950:       {
 951:         // If the primary order is null, it means this is an ignorable
 952:         // character.
 953:         if (CollationElementIterator.primaryOrder(ord) == 0)
 954:           {
 955:             ord = cei.next();
 956:             continue;
 957:           }
 958:         switch (getStrength())
 959:           {
 960:             case PRIMARY:
 961:               ord = CollationElementIterator.primaryOrder(ord);
 962:               break;
 963: 
 964:             case SECONDARY:
 965:               ord = CollationElementIterator.primaryOrder(ord) << 8;
 966:               ord |= CollationElementIterator.secondaryOrder(ord);
 967: 
 968:             default:
 969:                break;
 970:           }
 971: 
 972:         vect.add(Integer.valueOf(ord));
 973:         ord = cei.next(); //increment to next key
 974:       }
 975: 
 976:     Integer[] objarr = vect.toArray(new Integer[vect.size()]);
 977:     byte[] key = new byte[objarr.length * 4];
 978: 
 979:     for (int i = 0; i < objarr.length; i++)
 980:       {
 981:         int j = objarr[i].intValue();
 982:         key [i * 4] = (byte) ((j & 0xFF000000) >> 24);
 983:         key [i * 4 + 1] = (byte) ((j & 0x00FF0000) >> 16);
 984:         key [i * 4 + 2] = (byte) ((j & 0x0000FF00) >> 8);
 985:         key [i * 4 + 3] = (byte) (j & 0x000000FF);
 986:       }
 987: 
 988:     return new CollationKey(this, source, key);
 989:   }
 990: 
 991:   /**
 992:    * This method returns a <code>String</code> containing the collation rules
 993:    * for this object.
 994:    *
 995:    * @return The collation rules for this object.
 996:    */
 997:   public String getRules()
 998:   {
 999:     return rules;
1000:   }
1001: 
1002:   /**
1003:    * This method returns a hash value for this object.
1004:    *
1005:    * @return A hash value for this object.
1006:    */
1007:   public int hashCode()
1008:   {
1009:     return System.identityHashCode(this);
1010:   }
1011: }
Overview Package Class Use Source Tree Index Deprecated About
		Frames \| No Frames