From 57807c317869610e07a85bf8bf95747638230ed7 Mon Sep 17 00:00:00 2001 From: Michael Koch Date: Mon, 31 May 2004 22:16:31 +0000 Subject: [PATCH] CollationElementIterator.java, [...]: New versions from GNU classpath. 2004-06-01 Michael Koch * java/text/CollationElementIterator.java, java/text/CollationKey.java, java/text/RuleBasedCollator.java: New versions from GNU classpath. * testsuite/libjava.mauve/xfails: Removed all java.text.CollationElementIterator tests. From-SVN: r82510 --- libjava/ChangeLog | 8 + libjava/java/text/CollationElementIterator.java | 269 ++++++- libjava/java/text/CollationKey.java | 28 +- libjava/java/text/RuleBasedCollator.java | 925 +++++++++++++++++------- libjava/testsuite/libjava.mauve/xfails | 100 --- 5 files changed, 943 insertions(+), 387 deletions(-) diff --git a/libjava/ChangeLog b/libjava/ChangeLog index d0b99bf..35592a7 100644 --- a/libjava/ChangeLog +++ b/libjava/ChangeLog @@ -1,5 +1,13 @@ 2004-06-01 Michael Koch + * java/text/CollationElementIterator.java, + java/text/CollationKey.java, + java/text/RuleBasedCollator.java: New versions from GNU classpath. + * testsuite/libjava.mauve/xfails: Removed all + java.text.CollationElementIterator tests. + +2004-06-01 Michael Koch + * java/util/zip/InflaterInputStream.java: Merged more with Classpath version. * java/util/zip/ZipOutputStream.java (): Renamed enum to e to removed diff --git a/libjava/java/text/CollationElementIterator.java b/libjava/java/text/CollationElementIterator.java index 1b5a617..a4c2100 100644 --- a/libjava/java/text/CollationElementIterator.java +++ b/libjava/java/text/CollationElementIterator.java @@ -38,6 +38,8 @@ exception statement from your version. */ package java.text; +import java.util.Vector; + /* Written using "Java Class Libraries", 2nd edition, plus online * API docs for JDK 1.2 from http://www.javasoft.com. * Status: Believed complete and correct to JDK 1.1. @@ -74,13 +76,25 @@ public final class CollationElementIterator String text; /** + * This is the index into the collation decomposition where we are currently scanning. + */ + int index; + + /** * This is the index into the String where we are currently scanning. */ int textIndex; - // A piece of lookahead. - boolean lookahead_set; - int lookahead; + /** + * Array containing the collation decomposition of the + * text given to the constructor. + */ + private Object[] text_decomposition; + + /** + * Array containing the index of the specified block. + */ + private int[] text_indexes; /** * This method initializes a new instance of CollationElementIterator @@ -97,6 +111,35 @@ public final class CollationElementIterator setText (text); } + RuleBasedCollator.CollationElement nextBlock() + { + if (index >= text_decomposition.length) + return null; + + RuleBasedCollator.CollationElement e = + (RuleBasedCollator.CollationElement) text_decomposition[index]; + + textIndex = text_indexes[index+1]; + + index++; + + return e; + } + + RuleBasedCollator.CollationElement previousBlock() + { + if (index == 0) + return null; + + index--; + RuleBasedCollator.CollationElement e = + (RuleBasedCollator.CollationElement) text_decomposition[index]; + + textIndex = text_indexes[index+1]; + + return e; + } + /** * This method returns the collation ordering value of the next character sequence * in the string (it may be an extended character following collation rules). @@ -107,10 +150,29 @@ public final class CollationElementIterator */ public int next() { - if (textIndex == text.length()) + RuleBasedCollator.CollationElement e = nextBlock(); + + if (e == null) return NULLORDER; + + return e.getValue(); + } + + /** + * This method returns the collation ordering value of the previous character + * in the string. This method will return NULLORDER if the + * beginning of the string was reached. + * + * @return The collation ordering value. + */ + public int previous() + { + RuleBasedCollator.CollationElement e = previousBlock(); - return collator.ceiNext (this); + if (e == null) + return NULLORDER; + + return e.getValue(); } /** @@ -133,9 +195,8 @@ public final class CollationElementIterator */ public void reset() { + index = 0; textIndex = 0; - lookahead_set = false; - lookahead = 0; } /** @@ -176,10 +237,152 @@ public final class CollationElementIterator */ public void setText(String text) { + int idx = 0; + int idx_idx = 0; + int alreadyExpanded = 0; + int idxToMove = 0; + this.text = text; - this.textIndex = 0; - this.lookahead_set = false; - this.lookahead = 0; + this.index = 0; + + String work_text = text.intern(); + + Vector v = new Vector(); + Vector vi = new Vector(); + + // Build element collection ordered as they come in "text". + while (idx < work_text.length()) + { + String key, key_old; + + Object object = null; + int p = 1; + + // IMPROVE: use a TreeMap with a prefix-ordering rule. + key_old = key = null; + do + { + if (object != null) + key_old = key; + key = work_text.substring (idx, idx+p); + object = collator.prefix_tree.get (key); + if (object != null && idx < alreadyExpanded) + { + RuleBasedCollator.CollationElement prefix = (RuleBasedCollator.CollationElement)object; + if (prefix.expansion != null && + prefix.expansion.startsWith(work_text.substring(0, idx))) + { + object = null; + key = key_old; + } + } + p++; + } + while (idx+p <= work_text.length()); + + if (object == null) + key = key_old; + + RuleBasedCollator.CollationElement prefix = + (RuleBasedCollator.CollationElement) collator.prefix_tree.get (key); + + /* + * First case: There is no such sequence in the database. + * We will have to build one from the context. + */ + if (prefix == null) + { + /* + * We are dealing with sequences in an expansion. They + * are treated as accented characters (tertiary order). + */ + if (alreadyExpanded > 0) + { + RuleBasedCollator.CollationElement e = + collator.getDefaultAccentedElement (work_text.charAt (idx)); + + v.add (e); + vi.add (new Integer(idx_idx)); + idx++; + alreadyExpanded--; + if (alreadyExpanded == 0) + { + /* There is not any characters left in the expansion set. + * We can increase the pointer in the source string. + */ + idx_idx += idxToMove; + idxToMove = 0; + } + else + idx_idx++; + } + else + { + /* This is a normal character. */ + RuleBasedCollator.CollationElement e = + collator.getDefaultElement (work_text.charAt (idx)); + Integer i_ref = new Integer(idx_idx); + + /* Don't forget to mark it as a special sequence so the + * string can be ordered. + */ + v.add (RuleBasedCollator.SPECIAL_UNKNOWN_SEQ); + vi.add (i_ref); + v.add (e); + vi.add (i_ref); + idx_idx++; + idx++; + } + continue; + } + + /* + * Second case: Here we have found a matching sequence. + * Here we have an expansion string prepend it to the "work text" and + * add the corresponding sorting element. We must also mark + */ + if (prefix.expansion != null) + { + work_text = prefix.expansion + + work_text.substring (idx+prefix.key.length()); + idx = 0; + v.add (prefix); + vi.add (new Integer(idx_idx)); + if (alreadyExpanded == 0) + idxToMove = prefix.key.length(); + alreadyExpanded += prefix.expansion.length()-prefix.key.length(); + } + else + { + /* Third case: the simplest. We have got the prefix and it + * has not to be expanded. + */ + v.add (prefix); + vi.add (new Integer(idx_idx)); + idx += prefix.key.length(); + /* If the sequence is in an expansion, we must decrease the + * counter. + */ + if (alreadyExpanded > 0) + { + alreadyExpanded -= prefix.key.length(); + if (alreadyExpanded == 0) + { + idx_idx += idxToMove; + idxToMove = 0; + } + } else + idx_idx += prefix.key.length(); + } + } + + text_decomposition = v.toArray(); + text_indexes = new int[vi.size()+1]; + for (int i = 0; i < vi.size(); i++) + { + text_indexes[i] = ((Integer)vi.elementAt(i)).intValue(); + } + text_indexes[vi.size()] = text.length(); } /** @@ -215,4 +418,50 @@ public final class CollationElementIterator { return textIndex; } + + /** + * This method sets the iteration index position into the current + * String to the specified value. This value must not + * be negative and must not be greater than the last index position + * in the String. + * + * @param offset The new iteration index position. + * + * @exception IllegalArgumentException If the new offset is not valid. + */ + public void setOffset(int offset) + { + if (offset < 0) + throw new IllegalArgumentException("Negative offset: " + offset); + + if (offset > (text.length() - 1)) + throw new IllegalArgumentException("Offset too large: " + offset); + + for (index = 0; index < text_decomposition.length; index++) + { + if (offset <= text_indexes[index]) + break; + } + /* + * As text_indexes[0] == 0, we should not have to take care whether index is + * greater than 0. It is always. + */ + if (text_indexes[index] == offset) + textIndex = offset; + else + textIndex = text_indexes[index-1]; + } + + /** + * This method returns the maximum length of any expansion sequence that + * ends with the specified collation order value. (Whatever that means). + * + * @param value The collation order value + * + * @param The maximum length of an expansion sequence. + */ + public int getMaxExpansion(int value) + { + return 1; + } } diff --git a/libjava/java/text/CollationKey.java b/libjava/java/text/CollationKey.java index abc28b2..4e6aca2 100644 --- a/libjava/java/text/CollationKey.java +++ b/libjava/java/text/CollationKey.java @@ -78,24 +78,13 @@ public final class CollationKey implements Comparable /** * This is the bit value for this key. */ - private int[] key; + private byte[] key; - CollationKey(Collator collator, CollationElementIterator iter, - String originalText, int strength) + CollationKey (Collator collator, String originalText, byte[] key) { this.collator = collator; this.originalText = originalText; - - // Compute size of required array. - int size = 0; - while (RuleBasedCollator.next(iter, strength) - != CollationElementIterator.NULLORDER) - ++size; - - iter.reset(); - key = new int[size]; - for (int i = 0; i < size; i++) - key[i] = RuleBasedCollator.next(iter, strength); + this.key = key; } /** @@ -205,15 +194,6 @@ public final class CollationKey implements Comparable */ public byte[] toByteArray() { - byte[] r = new byte[4 * key.length]; - int off = 0; - for (int i = 0; i < key.length; ++i) - { - r[off++] = (byte) ((key[i] >>> 24) & 255); - r[off++] = (byte) ((key[i] >>> 16) & 255); - r[off++] = (byte) ((key[i] >>> 8) & 255); - r[off++] = (byte) ((key[i] ) & 255); - } - return r; + return key; } } diff --git a/libjava/java/text/RuleBasedCollator.java b/libjava/java/text/RuleBasedCollator.java index 64c2ac2..23f238e 100644 --- a/libjava/java/text/RuleBasedCollator.java +++ b/libjava/java/text/RuleBasedCollator.java @@ -38,7 +38,7 @@ exception statement from your version. */ package java.text; import java.util.Enumeration; -import java.util.Hashtable; +import java.util.HashMap; import java.util.Vector; /* Written using "Java Class Libraries", 2nd edition, plus online @@ -147,32 +147,104 @@ public class RuleBasedCollator extends Collator * This class describes what rank has a character (or a sequence of characters) * in the lexicographic order. Each element in a rule has a collation element. */ - final class CollationElement + final static class CollationElement { String key; - char relation; - - CollationElement(String key, char relation) + int primary; + short secondary; + short tertiary; + short equality; + boolean ignore; + String expansion; + + CollationElement(String key, int primary, short secondary, short tertiary, + short equality, String expansion, boolean ignore) { this.key = key; - this.relation = relation; + this.primary = primary; + this.secondary = secondary; + this.tertiary = tertiary; + this.equality = equality; + this.ignore = ignore; + this.expansion = expansion; + } + + final int getValue() + { + return (primary << 16) + (secondary << 8) + tertiary; } } - // True if we are using French-style accent ordering. - private boolean frenchAccents; + /** + * Basic collation instruction (internal format) to build the series of + * collation elements. It contains an instruction which specifies the new + * state of the generator. The sequence of instruction should not contain + * RESET (it is used by + * {@link #mergeRules(int,java.lang.String,java.util.Vector,java.util.Vector)}) + * as a temporary state while merging two sets of instructions. + */ + final static class CollationSorter + { + static final int GREATERP = 0; + static final int GREATERS = 1; + static final int GREATERT = 2; + static final int EQUAL = 3; + static final int RESET = 4; + static final int INVERSE_SECONDARY = 5; + + int comparisonType; + String textElement; + int hashText; + int offset; + boolean ignore; + + String expansionOrdering; + } /** * This the the original rule string. */ private String rules; - // This maps strings onto collation values. - private Hashtable map; - - // An entry in this hash means that more lookahead is required for - // the prefix string. - private Hashtable prefixes; + /** + * This is the table of collation element values + */ + private Object[] ce_table; + + /** + * Quick-prefix finder. + */ + HashMap prefix_tree; + + /** + * This is the value of the last sequence entered into + * ce_table. It is used to compute the + * ordering value of unspecified character. + */ + private int last_primary_value; + + /** + * This is the value of the last secondary sequence of the + * primary 0, entered into + * ce_table. It is used to compute the + * ordering value of an unspecified accented character. + */ + private int last_tertiary_value; + + /** + * This variable is true if accents need to be sorted + * in the other direction. + */ + private boolean inverseAccentComparison; + + /** + * This collation element is special to unknown sequence. + * The JDK uses it to mark and sort the characters which has + * no collation rules. + */ + static final CollationElement SPECIAL_UNKNOWN_SEQ = + new CollationElement("", (short) 32767, (short) 0, (short) 0, + (short) 0, null, false); /** * This method initializes a new instance of RuleBasedCollator @@ -192,128 +264,313 @@ public class RuleBasedCollator extends Collator this.rules = rules; - // We keep each rule in order in a vector. At the end we traverse - // the vector and compute collation values from it. - int insertion_index = 0; - Vector vec = new Vector (); + buildCollationVector(parseString(rules)); + buildPrefixAccess(); + } + /** + * This method returns the number of common characters at the beginning + * of the string of the two parameters. + * + * @param prefix A string considered as a prefix to test against + * the other string. + * @param s A string to test the prefix against. + * @return The number of common characters. + */ + static int findPrefixLength(String prefix, String s) + { int index; - StringBuffer argument = new StringBuffer(); - int len = rules.length(); + int len = prefix.length(); - for (index = 0; index < len; ++index) + for (index = 0; index < len && index < s.length(); ++index) { - char c = rules.charAt(index); + if (prefix.charAt(index) != s.charAt(index)) + return index; + } - // Just skip whitespace. - if (Character.isWhitespace(c)) - continue; - // Modifier. - if (c == '@') + return index; + } + + /** + * Here we are merging two sets of sorting instructions: 'patch' into 'main'. This methods + * checks whether it is possible to find an anchor point for the rules to be merged and + * then insert them at that precise point. + * + * @param offset Offset in the string containing rules of the beginning of the rules + * being merged in. + * @param starter Text of the rules being merged. + * @param main Repository of all already parsed rules. + * @param patch Rules to be merged into the repository. + * @throws ParseException if it is impossible to find an anchor point for the new rules. + */ + private void mergeRules(int offset, String starter, Vector main, Vector patch) + throws ParseException + { + Enumeration elements = main.elements(); + int insertion_point = -1; + int max_length = 0; + + /* We must check that no rules conflict with another already present. If it + * is the case delete the old rule. + */ + + /* For the moment good old O(N^2) algorithm. + */ + for (int i = 0; i < patch.size(); i++) + { + int j = 0; + + while (j < main.size()) { - frenchAccents = true; - continue; + CollationSorter rule1 = (CollationSorter) patch.elementAt(i); + CollationSorter rule2 = (CollationSorter) main.elementAt(j); + + if (rule1.textElement.equals(rule2.textElement)) + main.removeElementAt(j); + else + j++; } + } - // Check for relation or reset operator. - if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&')) - throw new ParseException ("invalid character", index); - - ++index; - while (index < len) + // Find the insertion point... O(N) + for (int i = 0; i < main.size(); i++) + { + CollationSorter sorter = (CollationSorter) main.elementAt(i); + int length = findPrefixLength(starter, sorter.textElement); + + if (length > max_length) { - if (! Character.isWhitespace(rules.charAt(index))) - break; - ++index; + max_length = length; + insertion_point = i+1; } - if (index == len) - throw new ParseException ("missing argument", index); - - int save = index; - index = text_argument (rules, index, argument); - if (argument.length() == 0) - throw new ParseException ("invalid character", save); - String arg = argument.toString(); - int item_index = -1; - - for (int j = 0; j < vec.size(); ++j) - { - CollationElement e = (CollationElement) vec.elementAt (j); + } - if (arg.equals (e.key)) - { - item_index = j; - break; - } - } + if (insertion_point < 0) + throw new ParseException("no insertion point found for " + starter, offset); + + if (max_length < starter.length()) + { + /* + * We need to expand the first entry. It must be sorted + * like if it was the reference key itself (like the spec + * said. So the first entry is special: the element is + * replaced by the specified text element for the sorting. + * This text replace the old one for comparisons. However + * to preserve the behaviour we replace the first key (corresponding + * to the found prefix) by a new code rightly ordered in the + * sequence. The rest of the subsequence must be appended + * to the end of the sequence. + */ + CollationSorter sorter = (CollationSorter) patch.elementAt(0); + CollationSorter expansionPrefix = + (CollationSorter) main.elementAt(insertion_point-1); - if (c != '&') + sorter.expansionOrdering = starter.substring(max_length); // Skip the first good prefix element + + main.insertElementAt(sorter, insertion_point); + + /* + * This is a new set of rules. Append to the list. + */ + patch.removeElementAt(0); + insertion_point++; + } + + // Now insert all elements of patch at the insertion point. + for (int i = 0; i < patch.size(); i++) + main.insertElementAt(patch.elementAt(i), i+insertion_point); + } + + /** + * This method parses a string and build a set of sorting instructions. The parsing + * may only be partial on the case the rules are to be merged sometime later. + * + * @param stop_on_reset If this parameter is true then the parser stops when it + * encounters a reset instruction. In the other case, it tries to parse the subrules + * and merged it in the same repository. + * @param v Output vector for the set of instructions. + * @param base_offset Offset in the string to begin parsing. + * @param rules Rules to be parsed. + * @return -1 if the parser reached the end of the string, an integer representing the + * offset in the string at which it stopped parsing. + * @throws ParseException if something turned wrong during the parsing. To get details + * decode the message. + */ + private int subParseString(boolean stop_on_reset, Vector v, + int base_offset, String rules) + throws ParseException + { + boolean ignoreChars = (base_offset == 0); + int operator = -1; + StringBuffer sb = new StringBuffer(); + boolean doubleQuote = false; + boolean eatingChars = false; + boolean nextIsModifier = false; + boolean isModifier = false; + int i; + +main_parse_loop: + for (i = 0; i < rules.length(); i++) + { + char c = rules.charAt(i); + int type = -1; + + if (!eatingChars && + ((c >= 0x09 && c <= 0x0D) || (c == 0x20))) + continue; + + isModifier = nextIsModifier; + nextIsModifier = false; + + if (eatingChars && c != '\'') { - // If the argument already appears in the vector, then we - // must remove it in order to re-order. - if (item_index != -1) - { - vec.removeElementAt(item_index); - if (insertion_index >= item_index) - --insertion_index; - } - CollationElement r = new CollationElement (arg, c); - vec.insertElementAt(r, insertion_index); - ++insertion_index; + doubleQuote = false; + sb.append(c); + continue; } - else + if (doubleQuote && eatingChars) { - // Reset. - if (item_index == -1) - throw - new ParseException ("argument to reset not previously seen", - save); - insertion_index = item_index + 1; + sb.append(c); + doubleQuote = false; + continue; } - // Ugly: in this case the resulting INDEX comes from - // text_argument, which returns the index of the next - // character we should examine. - --index; - } + switch (c) { + case '!': + throw new ParseException + ("Modifier '!' is not yet supported by Classpath", i+base_offset); + case '<': + type = CollationSorter.GREATERP; + break; + case ';': + type = CollationSorter.GREATERS; + break; + case ',': + type = CollationSorter.GREATERT; + break; + case '=': + type = CollationSorter.EQUAL; + break; + case '\'': + eatingChars = !eatingChars; + doubleQuote = true; + break; + case '@': + if (ignoreChars) + throw new ParseException + ("comparison list has not yet been started. You may only use" + + "(<,;=&)", i+base_offset); + // Inverse the order of secondaries from now on. + nextIsModifier = true; + type = CollationSorter.INVERSE_SECONDARY; + break; + case '&': + type = CollationSorter.RESET; + if (stop_on_reset) + break main_parse_loop; + break; + default: + if (operator < 0) + throw new ParseException + ("operator missing at " + (i+base_offset), i+base_offset); + if (!eatingChars && + ((c >= 0x21 && c <= 0x2F) + || (c >= 0x3A && c <= 0x40) + || (c >= 0x5B && c <= 0x60) + || (c >= 0x7B && c <= 0x7E))) + throw new ParseException + ("unquoted punctuation character '"+c+"'", i+base_offset); + + //type = ignoreChars ? CollationSorter.IGNORE : -1; + sb.append(c); + break; + } - // Now construct a hash table that maps strings onto their - // collation values. - int primary = 0; - int secondary = 0; - int tertiary = 0; - this.map = new Hashtable (); - this.prefixes = new Hashtable (); - Enumeration e = vec.elements(); - while (e.hasMoreElements()) - { - CollationElement r = (CollationElement) e.nextElement(); - switch (r.relation) + if (type < 0) + continue; + + if (operator < 0) { - case '<': - ++primary; - secondary = 0; - tertiary = 0; - break; - case ';': - ++secondary; - tertiary = 0; - break; - case ',': - ++tertiary; - break; - case '=': - break; + operator = type; + continue; } - // This must match CollationElementIterator. - map.put(r.key, new Integer (primary << 16 - | secondary << 8 | tertiary)); - // Make a map of all lookaheads we might need. - for (int i = r.key.length() - 1; i >= 1; --i) - prefixes.put(r.key.substring(0, i), Boolean.TRUE); + if (sb.length() == 0 && !isModifier) + throw new ParseException + ("text element empty at " + (i+base_offset), i+base_offset); + + if (operator == CollationSorter.RESET) + { + /* Reposition in the sorting list at the position + * indicated by the text element. + */ + String subrules = rules.substring(i); + Vector sorted_rules = new Vector(); + int idx; + + // Parse the subrules but do not iterate through all + // sublist. This is the priviledge of the first call. + idx = subParseString(true, sorted_rules, base_offset+i, subrules); + + // Merge new parsed rules into the list. + mergeRules(base_offset+i, sb.toString(), v, sorted_rules); + sb.setLength(0); + + // Reset state to none. + operator = -1; + type = -1; + // We have found a new subrule at 'idx' but it has not been parsed. + if (idx >= 0) + { + i += idx-1; + continue main_parse_loop; + } + else + // No more rules. + break main_parse_loop; + } + + CollationSorter sorter = new CollationSorter(); + + if (operator == CollationSorter.GREATERP) + ignoreChars = false; + + sorter.comparisonType = operator; + sorter.textElement = sb.toString(); + sorter.hashText = sorter.textElement.hashCode(); + sorter.offset = base_offset+rules.length(); + sorter.ignore = ignoreChars; + sb.setLength(0); + + v.add(sorter); + operator = type; + } + + if (operator >= 0) + { + CollationSorter sorter = new CollationSorter(); + int pos = rules.length() + base_offset; + + if ((sb.length() != 0 && nextIsModifier) + || (sb.length() == 0 && !nextIsModifier && !eatingChars)) + throw new ParseException("text element empty at " + pos, pos); + + if (operator == CollationSorter.GREATERP) + ignoreChars = false; + + sorter.comparisonType = operator; + sorter.textElement = sb.toString(); + sorter.hashText = sorter.textElement.hashCode(); + sorter.offset = base_offset+pos; + sorter.ignore = ignoreChars; + v.add(sorter); } + + if (i == rules.length()) + return -1; + else + return i; } /** @@ -323,90 +580,130 @@ public class RuleBasedCollator extends Collator */ public Object clone() { - RuleBasedCollator c = (RuleBasedCollator) super.clone (); - c.map = (Hashtable) map.clone (); - c.prefixes = (Hashtable) map.clone (); - return c; + return super.clone(); } - // A helper for CollationElementIterator.next(). - int ceiNext (CollationElementIterator cei) + /** + * This method completely parses a string 'rules' containing sorting rules. + * + * @param rules String containing the rules to be parsed. + * @return A set of sorting instructions stored in a Vector. + * @throws ParseException if something turned wrong during the parsing. To get details + * decode the message. + */ + private Vector parseString(String rules) + throws ParseException { - if (cei.lookahead_set) - { - cei.lookahead_set = false; - return cei.lookahead; - } + Vector v = new Vector(); - int save = cei.textIndex; - int max = cei.text.length(); - String s = null; - - // It is possible to have a case where `abc' has a mapping, but - // neither `ab' nor `abd' do. In this case we must treat `abd' as - // nothing special. - boolean found = false; + // result of the first subParseString is not absolute (may be -1 or a + // positive integer). But we do not care. + subParseString(false, v, 0, rules); + + return v; + } - int i; - for (i = save + 1; i <= max; ++i) + /** + * This method uses the sorting instructions built by {@link #parseString} + * to build collation elements which can be directly used to sort strings. + * + * @param parsedElements Parsed instructions stored in a Vector. + * @throws ParseException if the order of the instructions are not valid. + */ + private void buildCollationVector(Vector parsedElements) + throws ParseException + { + int primary_seq = 0; + int last_tertiary_seq = 0; + short secondary_seq = 0; + short tertiary_seq = 0; + short equality_seq = 0; + boolean inverseComparisons = false; + final boolean DECREASING = false; + final boolean INCREASING = true; + boolean secondaryType = INCREASING; + Vector v = new Vector(); + + // elts is completely sorted. +element_loop: + for (int i = 0; i < parsedElements.size(); i++) { - s = cei.text.substring(save, i); - if (prefixes.get(s) == null) - break; - found = true; - } - // Assume s != null. + CollationSorter elt = (CollationSorter) parsedElements.elementAt(i); + boolean ignoreChar = false; - Object obj = map.get(s); - // The special case. - while (found && obj == null && s.length() > 1) - { - --i; - s = cei.text.substring(save, i); - obj = map.get(s); + switch (elt.comparisonType) + { + case CollationSorter.GREATERP: + primary_seq++; + if (inverseComparisons) + { + secondary_seq = Short.MAX_VALUE; + secondaryType = DECREASING; + } + else + { + secondary_seq = 0; + secondaryType = INCREASING; + } + tertiary_seq = 0; + equality_seq = 0; + inverseComparisons = false; + break; + case CollationSorter.GREATERS: + if (secondaryType == DECREASING) + secondary_seq--; + else + secondary_seq++; + tertiary_seq = 0; + equality_seq = 0; + break; + case CollationSorter.INVERSE_SECONDARY: + inverseComparisons = true; + continue element_loop; + case CollationSorter.GREATERT: + tertiary_seq++; + if (primary_seq == 0) + last_tertiary_seq = tertiary_seq; + equality_seq = 0; + break; + case CollationSorter.EQUAL: + equality_seq++; + break; + case CollationSorter.RESET: + throw new ParseException + ("Invalid reached state 'RESET'. Internal error", elt.offset); + default: + throw new ParseException + ("Invalid unknown state '" + elt.comparisonType + "'", elt.offset); + } + + v.add(new CollationElement(elt.textElement, primary_seq, + secondary_seq, tertiary_seq, + equality_seq, elt.expansionOrdering, elt.ignore)); } - // Update state. - cei.textIndex = i; + this.inverseAccentComparison = inverseComparisons; - if (obj == null) - { - // This idea, and the values, come from JDK. - // assert (s.length() == 1) - cei.lookahead_set = true; - cei.lookahead = s.charAt(0) << 8; - return 0x7fff << 16; - } + ce_table = v.toArray(); - return ((Integer) obj).intValue(); + last_primary_value = primary_seq+1; + last_tertiary_value = last_tertiary_seq+1; } - // A helper for compareTo() that returns the next character that has - // a nonzero ordering at the indicated strength. This is also used - // in CollationKey. - static final int next (CollationElementIterator iter, int strength) + /** + * Build a tree where all keys are the texts of collation elements and data is + * the collation element itself. The tree is used when extracting all prefix + * for a given text. + */ + private void buildPrefixAccess() { - while (true) + prefix_tree = new HashMap(); + + for (int i = 0; i < ce_table.length; i++) { - int os = iter.next(); - if (os == CollationElementIterator.NULLORDER) - return os; - int c = 0; - switch (strength) - { - case PRIMARY: - c = os & ~0xffff; - break; - case SECONDARY: - c = os & ~0x00ff; - break; - case TERTIARY: - case IDENTICAL: - c = os; - break; - } - if (c != 0) - return c; + CollationElement e = (CollationElement) ce_table[i]; + + prefix_tree.put(e.key, e); } } @@ -425,34 +722,115 @@ public class RuleBasedCollator extends Collator public int compare(String source, String target) { CollationElementIterator cs, ct; + CollationElement ord1block = null; + CollationElement ord2block = null; + boolean advance_block_1 = true; + boolean advance_block_2 = true; - cs = new CollationElementIterator(this, source); - ct = new CollationElementIterator(this, target); + cs = getCollationElementIterator(source); + ct = getCollationElementIterator(target); for(;;) { - int os = next (cs, strength); - int ot = next (ct, strength); + int ord1; + int ord2; + + /* + * We have to check whether the characters are ignorable. + * If it is the case then forget them. + */ + if (advance_block_1) + { + ord1block = cs.nextBlock(); + if (ord1block != null && ord1block.ignore) + continue; + } + + if (advance_block_2) + { + ord2block = ct.nextBlock(); + if (ord2block != null && ord2block.ignore) + { + advance_block_1 = false; + continue; + } + } + else + advance_block_2 = true; - if (os == CollationElementIterator.NULLORDER - && ot == CollationElementIterator.NULLORDER) - break; - else if (os == CollationElementIterator.NULLORDER) + if (!advance_block_1) + advance_block_1 = true; + + if (ord1block != null) + ord1 = ord1block.getValue(); + else { - // Source string is shorter, so return "less than". + if (ord2block == null) + return 0; return -1; } - else if (ot == CollationElementIterator.NULLORDER) + + if (ord2block == null) + return 1; + + ord2 = ord2block.getValue(); + + // We know chars are totally equal, so skip + if (ord1 == ord2) { - // Target string is shorter, so return "greater than". - return 1; + if (getStrength() == IDENTICAL) + if (!ord1block.key.equals(ord2block.key)) + return ord1block.key.compareTo(ord2block.key); + continue; } - if (os != ot) - return os - ot; - } + // Check for primary strength differences + int prim1 = CollationElementIterator.primaryOrder(ord1); + int prim2 = CollationElementIterator.primaryOrder(ord2); + + if (prim1 == 0 && getStrength() < TERTIARY) + { + advance_block_2 = false; + continue; + } + else if (prim2 == 0 && getStrength() < TERTIARY) + { + advance_block_1 = false; + continue; + } + + if (prim1 < prim2) + return -1; + else if (prim1 > prim2) + return 1; + else if (getStrength() == PRIMARY) + continue; + + // Check for secondary strength differences + int sec1 = CollationElementIterator.secondaryOrder(ord1); + int sec2 = CollationElementIterator.secondaryOrder(ord2); + + if (sec1 < sec2) + return -1; + else if (sec1 > sec2) + return 1; + else if (getStrength() == SECONDARY) + continue; + + // Check for tertiary differences + int tert1 = CollationElementIterator.tertiaryOrder(ord1); + int tert2 = CollationElementIterator.tertiaryOrder(ord2); + + if (tert1 < tert2) + return -1; + else if (tert1 > tert2) + return 1; + else if (getStrength() == TERTIARY) + continue; - return 0; + // Apparently JDK does this (at least for my test case). + return ord1block.key.compareTo(ord2block.key); + } } /** @@ -467,13 +845,54 @@ public class RuleBasedCollator extends Collator */ public boolean equals(Object obj) { - if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj)) + if (obj == this) + return true; + else return false; - RuleBasedCollator rbc = (RuleBasedCollator) obj; - // FIXME: this is probably wrong. Instead we should compare maps - // directly. - return (frenchAccents == rbc.frenchAccents - && rules.equals(rbc.rules)); + } + + /** + * This method builds a default collation element without invoking + * the database created from the rules passed to the constructor. + * + * @param c Character which needs a collation element. + * @return A valid brand new CollationElement instance. + */ + CollationElement getDefaultElement(char c) + { + int v; + + // Preliminary support for generic accent sorting inversion (I don't know if all + // characters in the range should be sorted backward). This is the place + // to fix this if needed. + if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361)) + v = 0x0361 - ((int) c - 0x02B9); + else + v = (short) c; + return new CollationElement("" + c, last_primary_value + v, + (short) 0, (short) 0, (short) 0, null, false); + } + + /** + * This method builds a default collation element for an accented character + * without invoking the database created from the rules passed to the constructor. + * + * @param c Character which needs a collation element. + * @return A valid brand new CollationElement instance. + */ + CollationElement getDefaultAccentedElement(char c) + { + int v; + + // Preliminary support for generic accent sorting inversion (I don't know if all + // characters in the range should be sorted backward). This is the place + // to fix this if needed. + if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361)) + v = 0x0361 - ((int) c - 0x02B9); + else + v = (short) c; + return new CollationElement("" + c, (short) 0, + (short) 0, (short) (last_tertiary_value + v), (short) 0, null, false); } /** @@ -489,13 +908,7 @@ public class RuleBasedCollator extends Collator */ public CollationElementIterator getCollationElementIterator(String source) { - int len = source.length(); - StringBuffer expand = new StringBuffer(len); - - for (int index = 0; index < len; ++index) - decomposeCharacter(source.charAt(index), expand); - - return new CollationElementIterator(this, expand.toString()); + return new CollationElementIterator(this, source); } /** @@ -517,7 +930,7 @@ public class RuleBasedCollator extends Collator c = source.next()) decomposeCharacter(c, expand); - return new CollationElementIterator(this, expand.toString()); + return getCollationElementIterator(expand.toString()); } /** @@ -533,8 +946,52 @@ public class RuleBasedCollator extends Collator */ public CollationKey getCollationKey(String source) { - return new CollationKey(this, getCollationElementIterator(source), source, - strength); + CollationElementIterator cei = getCollationElementIterator(source); + Vector vect = new Vector(25); + + int ord = cei.next(); + cei.reset(); //set to start of string + + while (ord != CollationElementIterator.NULLORDER) + { + // If the primary order is null, it means this is an ignorable + // character. + if (CollationElementIterator.primaryOrder(ord) == 0) + { + ord = cei.next(); + continue; + } + switch (getStrength()) + { + case PRIMARY: + ord = CollationElementIterator.primaryOrder(ord); + break; + + case SECONDARY: + ord = CollationElementIterator.primaryOrder(ord) << 8; + ord |= CollationElementIterator.secondaryOrder(ord); + + default: + break; + } + + vect.add(new Integer(ord)); + ord = cei.next(); //increment to next key + } + + Object[] objarr = vect.toArray(); + byte[] key = new byte[objarr.length * 4]; + + for (int i = 0; i < objarr.length; i++) + { + int j = ((Integer) objarr[i]).intValue(); + key [i * 4] = (byte) ((j & 0xFF000000) >> 24); + key [i * 4 + 1] = (byte) ((j & 0x00FF0000) >> 16); + key [i * 4 + 2] = (byte) ((j & 0x0000FF00) >> 8); + key [i * 4 + 3] = (byte) (j & 0x000000FF); + } + + return new CollationKey(this, source, key); } /** @@ -555,44 +1012,6 @@ public class RuleBasedCollator extends Collator */ public int hashCode() { - return (frenchAccents ? 1231 : 1237 - ^ rules.hashCode() - ^ map.hashCode() - ^ prefixes.hashCode()); + return System.identityHashCode(this); } - - private final boolean is_special (char c) - { - // Rules from JCL book. - return ((c >= 0x0021 && c <= 0x002f) - || (c >= 0x003a && c <= 0x0040) - || (c >= 0x005b && c <= 0x0060) - || (c >= 0x007b && c <= 0x007e)); - } - - private final int text_argument (String rules, int index, - StringBuffer result) - { - result.setLength(0); - int len = rules.length(); - while (index < len) - { - char c = rules.charAt (index); - if (c == '\'' - && index + 2 < len - && rules.charAt (index + 2) == '\'') - { - result.append (rules.charAt (index + 1)); - index += 2; - } - else if (is_special (c)) - return index; - else if (!Character.isWhitespace (c)) - result.append (c); - - ++index; - } - return index; - } - } diff --git a/libjava/testsuite/libjava.mauve/xfails b/libjava/testsuite/libjava.mauve/xfails index 5f2f34a..37981be 100644 --- a/libjava/testsuite/libjava.mauve/xfails +++ b/libjava/testsuite/libjava.mauve/xfails @@ -35,106 +35,6 @@ FAIL: gnu.testlet.java.lang.String.getBytes14: String.getBytes("UTF-16LE") (numb FAIL: gnu.testlet.java.text.DateFormatSymbols.Test: patterns (number 2) FAIL: gnu.testlet.java.text.SimpleDateFormat.getAndSet2DigitYearStart: get2DigitYearStart() initial (number 1) FAIL: gnu.testlet.java.text.DateFormatSymbols.Test: invalid locale (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: CollationElementIterator (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 0 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 1 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 10 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 11 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 12 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 13 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 2 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 3 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 5 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 6 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 7 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 8 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 9 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 0 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 1 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 10 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 11 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 12 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 13 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 2 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 3 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 5 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 6 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 7 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: primaryOrder() 8 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: secondaryOrder() 12 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: secondaryOrder() 4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: secondaryOrder() 5 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: secondaryOrder() 7 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: secondaryOrder() 9 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: tertiaryOrder() 10 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 2 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 2 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 4 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 4 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 6 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 6 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 8 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 8 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 10 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 10 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 12 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 12 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 14 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 14 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 16 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 16 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 18 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 18 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 20 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 20 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 22 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 22 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 24 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 24 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 26 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 26 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 28 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 28 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 30 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 30 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 32 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 32 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 34 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 34 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 36 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 36 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 38 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 38 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 40 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 40 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 42 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 42 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 44 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 44 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 46 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 46 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 48 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 48 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 50 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 50 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 52 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 52 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 54 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 54 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 56 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 56 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 58 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 58 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 60 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 60 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 62 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no primary difference 62 test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: wrong number of keys test string #4 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: next() 2 test string #5 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: no tertiary difference2 test string #5 (number 1) -FAIL: gnu.testlet.java.text.CollationElementIterator.jdk11: not tertiary ordered0, 2 test set #0 (number 1) FAIL: gnu.testlet.java.net.URLConnection.URLConnectionTest: Error in test_Basics - 2 should not have raised Throwable here (number 1) FAIL: gnu.testlet.java.net.URLConnection.URLConnectionTest: Error in test_getHeaderField - 2 4 header field wrong (number 1) FAIL: gnu.testlet.java.net.URL.URLTest: openStream (number 1) -- 2.7.4