blob: 60465d425fc9ec3969d86cb81bdb1ff088ec22ea [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id: EncodingInfo.java 468654 2006-10-28 07:09:23Z minchau $
*/
package org.apache.xml.serializer;
/**
* Holds information about a given encoding: the Java name for the
* encoding and the equivalent ISO name.
* <p>
* An object of this type has two useful methods
* <pre>
* isInEncoding(char ch);
* </pre>
* which can be called if the character is not the high one in
* a surrogate pair and:
* <pre>
* isInEncoding(char high, char low);
* </pre>
* which can be called if the two characters form a high/low surrogate pair.
* <p>
* Internally the answers come from a private helper object that is a node
* in a binary search tree. Such a node will answer if a character is in
* the encoding, and do so for a given range of unicode values
* (<code>m_first</code> to <code>m_last</code>). It will handle a certain
* range of values explicitly (<code>m_explFirst</code> to
* <code>m_explLast</code>). If the unicode point is before that explicit
* range, that is, it is in the range
* <code>m_first &lt;= value &lt; m_explFirst</code>, then the node will
* delegate to another node, <code>m_before</code>, which is the root of
* the subtree for those lower values. Likewise for values in the range
* <code>m_explLast &lt; value &lt;= m_last</code>, but delegating to
* <code>m_after</code>.
* <p>
* Actually figuring out if a code point is in the encoding is expensive. So the
* purpose of this tree is to cache such determinations, and not to build the
* entire tree of information at the start, but only build up as much of the
* tree as is used during the transformation.
* <p>
* This Class is not a public API, and should only be used internally within
* the serializer.
* <p>
* This class is not a public API.
* @xsl.usage internal
*/
public final class EncodingInfo extends Object
{
/**
 * The highest char of the lowest contiguous group of chars in the encoding.
 * <p>
 * Not all characters in an encoding are in one contiguous group,
 * however there is a lowest contiguous group starting at '\u0001'
 * and working up to m_highCharInContiguousGroup.
 * <p>
 * This is the char for which chars at or below this value are
 * definitely in the encoding, although chars above this point
 * might be in the encoding as well.
 * This exists for performance, especially for ASCII characters
 * because for ASCII all chars in the range '\u0001' to '\u007F'
 * are in the encoding.
 * <p>
 * The value is supplied to the constructor and never changes.
 */
private final char m_highCharInContiguousGroup;
/**
 * The ISO encoding name, as supplied to the constructor; may be null.
 */
final String name;
/**
 * The name used by the Java converter, as supplied to the constructor;
 * may be null. This is the name handed to the helper object that
 * decides whether a character is in the encoding.
 */
final String javaName;
/**
 * A helper object that we can ask if a
 * single char, or a surrogate UTF-16 pair
 * of chars that form a single character,
 * is in this encoding.
 * <p>
 * It starts out null and is created by the first call to one of the
 * isInEncoding methods.
 */
private InEncoding m_encoding;
/**
 * Tells whether a single char can be represented in this encoding.
 * <p>
 * The helper object that answers the question is created on the first
 * call and reused afterwards.
 * <p>
 * This method is not a public API.
 *
 * @param ch the char in question.
 * @return true if the char is in the encoding.
 * @xsl.usage internal
 */
public boolean isInEncoding(char ch) {
    InEncoding checker = m_encoding;
    if (checker == null) {
        // First use: build the default helper. A different InEncoding
        // implementation (for example one based on newer JRE facilities)
        // could be chosen at this point instead.
        checker = new EncodingImpl();
        m_encoding = checker;
    }
    return checker.isInEncoding(ch);
}
/**
 * Tells whether the character formed by a high/low surrogate pair can
 * be represented in this encoding.
 * <p>
 * The helper object that answers the question is created on the first
 * call and reused afterwards.
 * <p>
 * This method is not a public API.
 *
 * @param high a char that is the high char of a high/low surrogate pair.
 * @param low a char that is the low char of a high/low surrogate pair.
 * @return true if the character formed by the pair is in the encoding.
 * @xsl.usage internal
 */
public boolean isInEncoding(char high, char low) {
    InEncoding checker = m_encoding;
    if (checker == null) {
        // First use: build the default helper. A different InEncoding
        // implementation (for example one based on newer JRE facilities)
        // could be chosen at this point instead.
        checker = new EncodingImpl();
        m_encoding = checker;
    }
    return checker.isInEncoding(high, low);
}
/**
 * Creates the description of an encoding from its ISO name and Java name.
 * <p>
 * Both names may be null, in which case any character is meant to be
 * considered part of the encoding. This is useful when the serializer
 * is in temporary output state and has no associated encoding.
 *
 * @param name reference to the ISO name.
 * @param javaName reference to the Java encoding name.
 * @param highChar the char for which characters at or below this value
 * are definitely in the encoding, although characters above this point
 * might be in the encoding as well.
 */
public EncodingInfo(String name, String javaName, char highChar)
{
    // The three assignments are independent of each other.
    this.m_highCharInContiguousGroup = highChar;
    this.javaName = javaName;
    this.name = name;
}
/**
 * Small abstraction that hides how membership in the encoding is decided.
 * Another implementation, for instance one that reaches newer JRE methods
 * through reflection, could be plugged in behind this interface.
 * <p>
 * This interface is not a public API,
 * and should only be used internally within the serializer.
 * @xsl.usage internal
 */
private interface InEncoding {

    /**
     * Returns true if the char is in the encoding.
     *
     * @param ch a char that is not part of a surrogate pair.
     */
    boolean isInEncoding(char ch);

    /**
     * Returns true if the high/low surrogate pair forms
     * a character that is in the encoding.
     *
     * @param high the high char of the surrogate pair.
     * @param low the low char of the surrogate pair.
     */
    boolean isInEncoding(char high, char low);
}
/**
 * The default implementation of the InEncoding interface.
 * <p>
 * An object of this class is one node of a lazily built binary search
 * tree that caches, per unicode value, whether that value is in the
 * encoding. A node answers for the values <code>m_first</code> through
 * <code>m_last</code>. It keeps the answers for the <code>RANGE</code>
 * values starting at <code>m_explFirst</code> in its own tables, and
 * delegates smaller values to <code>m_before</code> and larger values to
 * <code>m_after</code>. Those two nodes are only created when a value in
 * their range is first asked about.
 */
private class EncodingImpl implements InEncoding {

    /**
     * The number of unicode values explicitly handled
     * by a single node. This value is tuneable, but is set to 128
     * because that covers the entire low range of ASCII type chars
     * within a single object.
     */
    private static final int RANGE = 128;

    /**
     * The Java name of the encoding, handed to the enclosing class's
     * inEncoding methods when an answer has to be computed.
     */
    private final String m_encoding;

    /**
     * m_first through m_last is the range of unicode
     * values that this object will return an answer on.
     * It may delegate to a similar object with a different range.
     */
    private final int m_first;
    private final int m_last;

    /**
     * m_explFirst through m_explLast is the range of unicode
     * values that this object handles explicitly and does not
     * delegate to a similar object.
     */
    private final int m_explFirst;
    private final int m_explLast;

    /**
     * The object, of the same type as this one, that handles unicode
     * values in a range before the range explicitly handled by this
     * object, and to which this object may delegate. Null until needed.
     */
    private InEncoding m_before;

    /**
     * The object, of the same type as this one, that handles unicode
     * values in a range after the range explicitly handled by this
     * object, and to which this object may delegate. Null until needed.
     */
    private InEncoding m_after;

    /**
     * A flag per explicitly handled unicode value recording whether
     * the answer for that value is already known.
     */
    private final boolean[] m_alreadyKnown = new boolean[RANGE];

    /**
     * The cached answers; an entry is only meaningful once the matching
     * entry of m_alreadyKnown is true.
     */
    private final boolean[] m_isInEncoding = new boolean[RANGE];

    /**
     * Creates the root node, which answers for the unicode values
     * 0 through Integer.MAX_VALUE and explicitly handles the RANGE
     * values starting at 0.
     */
    private EncodingImpl() {
        this(javaName, 0, Integer.MAX_VALUE, (char) 0);
    }

    /**
     * Creates a node.
     *
     * @param encoding the Java name of the encoding, possibly null.
     * @param first the lowest unicode value this node answers for.
     * @param last the highest unicode value this node answers for.
     * @param codePoint the first unicode value that this node handles
     * explicitly; the explicit range is RANGE values long.
     */
    private EncodingImpl(String encoding, int first, int last, int codePoint) {
        // The range of unicode values that this object manages,
        // either explicitly or by delegation.
        m_first = first;
        m_last = last;

        // The range of unicode values that this object manages explicitly.
        m_explFirst = codePoint;
        m_explLast = codePoint + (RANGE - 1);

        m_encoding = encoding;

        if (javaName == null) {
            // A little bit more than optimization.
            //
            // Without an encoding every character is said to be in the
            // encoding. This is meaningful when the serializer is being
            // used in temporary output state, where we are not writing
            // to the final output tree.
            //
            // This test has to stand on its own: nested inside a
            // "javaName != null" block it could never be true, and
            // surrogate pairs would then be reported as not being in
            // the encoding.
            for (int idx = 0; idx < RANGE; idx++) {
                m_alreadyKnown[idx] = true;
                m_isInEncoding[idx] = true;
            }
        } else if (0 <= m_explFirst && m_explFirst <= 127) {
            // Some optimization: this node explicitly handles characters
            // in the low range.
            final boolean lowRangeIsKnown =
                "UTF8".equals(javaName)
                    || "UTF-16".equals(javaName)
                    || "ASCII".equals(javaName)
                    || "US-ASCII".equals(javaName)
                    || "Unicode".equals(javaName)
                    || "UNICODE".equals(javaName)
                    || javaName.startsWith("ISO8859");
            if (lowRangeIsKnown) {
                // For these common encodings the answer for the low
                // values is known up front, so the tables are filled in
                // now and the expensive inEncoding(char, String) call is
                // avoided for them.
                // NOTE(review): the loop stops before 127, so that value
                // is still resolved on demand.
                for (int unicode = Math.max(1, m_explFirst); unicode < 127; unicode++) {
                    final int idx = unicode - m_explFirst;
                    m_alreadyKnown[idx] = true;
                    m_isInEncoding[idx] = true;
                }
            }
        }
    }

    /**
     * Returns true if the char is in the encoding, consulting the
     * cache of this node or of the node responsible for the value.
     *
     * @param ch1 a char that is not part of a surrogate pair.
     */
    public boolean isInEncoding(char ch1) {
        final int codePoint = Encodings.toCodePoint(ch1);
        if (codePoint < m_explFirst) {
            // Before the range that we explicitly manage: delegate.
            return nodeBefore(codePoint).isInEncoding(ch1);
        }
        if (m_explLast < codePoint) {
            // After the range that we explicitly manage: delegate.
            return nodeAfter(codePoint).isInEncoding(ch1);
        }
        // The unicode value is in the range we explicitly handle.
        final int idx = codePoint - m_explFirst;
        if (!m_alreadyKnown[idx]) {
            // We don't know the answer, so find out,
            // which may be expensive, then cache the answer.
            m_isInEncoding[idx] = inEncoding(ch1, m_encoding);
            m_alreadyKnown[idx] = true;
        }
        return m_isInEncoding[idx];
    }

    /**
     * Returns true if the character formed by the high/low surrogate
     * pair is in the encoding, consulting the cache of this node or of
     * the node responsible for the value.
     *
     * @param high the high char of the surrogate pair.
     * @param low the low char of the surrogate pair.
     */
    public boolean isInEncoding(char high, char low) {
        final int codePoint = Encodings.toCodePoint(high, low);
        if (codePoint < m_explFirst) {
            // Before the range that we explicitly manage: delegate.
            return nodeBefore(codePoint).isInEncoding(high, low);
        }
        if (m_explLast < codePoint) {
            // After the range that we explicitly manage: delegate.
            return nodeAfter(codePoint).isInEncoding(high, low);
        }
        // The unicode value is in the range we explicitly handle.
        final int idx = codePoint - m_explFirst;
        if (!m_alreadyKnown[idx]) {
            // We don't know the answer, so find out,
            // which may be expensive, then cache the answer.
            m_isInEncoding[idx] = inEncoding(high, low, m_encoding);
            m_alreadyKnown[idx] = true;
        }
        return m_isInEncoding[idx];
    }

    /**
     * Returns the node for values below the explicit range, creating it
     * if it does not exist yet with its explicit range starting at
     * the given value.
     */
    private InEncoding nodeBefore(int codePoint) {
        if (m_before == null) {
            m_before =
                new EncodingImpl(m_encoding, m_first, m_explFirst - 1, codePoint);
        }
        return m_before;
    }

    /**
     * Returns the node for values above the explicit range, creating it
     * if it does not exist yet with its explicit range starting at
     * the given value.
     */
    private InEncoding nodeAfter(int codePoint) {
        if (m_after == null) {
            m_after =
                new EncodingImpl(m_encoding, m_explLast + 1, m_last, codePoint);
        }
        return m_after;
    }
}
/**
 * Decides whether a single char can be represented in the named
 * encoding by encoding it and inspecting the resulting bytes.
 * This is probably expensive, so callers should cache the answer.
 * <p>
 * This method is not a public API,
 * and should only be used internally within the serializer.
 *
 * @param ch the char in question, that is not a high char of
 * a high/low surrogate pair.
 * @param encoding the Java name of the encoding.
 * @return true if the char appears to be in the encoding. When encoding
 * the char fails, the result is true only if <code>encoding</code> is
 * null.
 *
 * @xsl.usage internal
 */
private static boolean inEncoding(char ch, String encoding) {
    try {
        // Encode a one-char String into a sequence of bytes
        // using the given, named charset.
        final String text = String.valueOf(ch);
        final byte[] encoded = text.getBytes(encoding);
        return inEncoding(ch, encoded);
    } catch (Exception e) {
        // The char could not be encoded. If that is because there is
        // no encoding at all, e.g. for a temporary result tree, every
        // character is said to be in the encoding; otherwise it is not.
        return encoding == null;
    }
}
/**
 * This is the heart of the code that determines if a given high/low
 * surrogate pair forms a character that is in the given encoding.
 * This method is probably expensive, and the answer should be cached.
 * <p>
 * This method is not a public API,
 * and should only be used internally within the serializer.
 *
 * @param high the high char of a high/low surrogate pair.
 * @param low the low char of a high/low surrogate pair.
 * @param encoding the Java name of the encoding.
 * @return true if the character appears to be in the encoding. When
 * encoding the pair fails, the result is true only if
 * <code>encoding</code> is null.
 *
 * @xsl.usage internal
 */
private static boolean inEncoding(char high, char low, String encoding) {
    try {
        // Encode a String holding just the pair into a sequence of
        // bytes using the given, named charset.
        final String pair = new String(new char[] { high, low });
        final byte[] encoded = pair.getBytes(encoding);
        return inEncoding(high, encoded);
    } catch (Exception e) {
        // The pair could not be encoded. If that is because there is no
        // encoding at all, e.g. for a temporary result tree, every
        // character is said to be in the encoding, exactly as for a
        // single char; otherwise the character is not in the encoding.
        return encoding == null;
    }
}
/**
 * Judges from the bytes produced by s.getBytes(encoding) whether the
 * character that was encoded is in the encoding.
 * <p>
 * The verdict is a best effort and not foolproof: the behavior of
 * s.getBytes(encoding) is only specified for characters that are in the
 * encoding, so for other characters the bytes can merely be inspected
 * for signs of a failed conversion.
 *
 * @param ch the char that was converted using getBytes, or
 * the first char of a high/low pair that was converted.
 * @param data the bytes written out by the call to s.getBytes(encoding);
 * @return true if the character is in the encoding.
 */
private static boolean inEncoding(char ch, byte[] data) {
    // No output at all means the character was not encoded.
    if (data == null || data.length == 0) {
        return false;
    }

    final byte firstByte = data[0];

    // A leading zero byte is taken as a failed conversion.
    if (firstByte == 0) {
        return false;
    }

    // A question mark that was not asked for is taken as the
    // converter's substitute for a character it cannot represent.
    if (firstByte == '?' && ch != '?') {
        return false;
    }

    // A workaround for JRE bugs with Japanese encodings (EUC-JP, EUC_JP,
    // SJIS) was once sketched here: answer false when data[0] == 0x21 or
    // when ch == 0xA5. It was never enabled.

    // We don't know for sure, but it looks like it is in the encoding.
    return true;
}
/**
 * Returns the highest char of the lowest contiguous group of chars in
 * the encoding, as it was given to the constructor.
 * <p>
 * This method exists for performance reasons.
 * <p>
 * Except for '\u0000', if a char is less than or equal to the value
 * returned by this method then it is in the encoding.
 * <p>
 * The characters in an encoding are not contiguous, however
 * there is a lowest group of chars starting at '\u0001' up to and
 * including the char returned by this method that are all in the encoding.
 * So the char returned by this method essentially defines the lowest
 * contiguous group.
 * <p>
 * Chars above the value returned might be in the encoding, but
 * chars at or below the value returned are definitely in the encoding.
 * <p>
 * In any case however, the isInEncoding(char) method can be used
 * regardless of the value of the char returned by this method.
 * <p>
 * If the value returned is '\u0000' it means that every character must be tested
 * with an isInEncoding method, {@link #isInEncoding(char)} or,
 * for surrogate pairs, {@link #isInEncoding(char, char)}.
 * <p>
 * This method is not a public API.
 *
 * @return the char at or below which all chars, other than '\u0000',
 * are in the encoding.
 * @xsl.usage internal
 */
public final char getHighChar() {
return m_highCharInContiguousGroup;
}
}