blob: fe566555dd7c37fe04c00e4f5b714c430ec4285c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package java.nio.charset;
import com.ibm.icu4jni.charset.NativeConverter;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.spi.CharsetProvider;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.ServiceLoader;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* A charset is a named mapping between Unicode characters and byte sequences. Every
* {@code Charset} can <i>decode</i>, converting a byte sequence into a sequence of characters,
* and some can also <i>encode</i>, converting a sequence of characters into a byte sequence.
* Use the method {@link #canEncode} to find out whether a charset supports both.
*
* <h4>Characters</h4>
* <p>In the context of this class, <i>character</i> always refers to a Java character: a Unicode
* code point in the range U+0000 to U+FFFF. (Java represents supplementary characters using surrogates.)
* Not all byte sequences will represent a character, and not
* all characters can necessarily be represented by a given charset. The method {@link #contains}
* can be used to determine whether every character representable by one charset can also be
* represented by another (meaning that a lossless transformation is possible from the contained
* to the container).
*
* <h4>Encodings</h4>
* <p>There are many possible ways to represent Unicode characters as byte sequences.
* See <a href="http://www.unicode.org/reports/tr17/">UTR#17: Unicode Character Encoding Model</a>
* for detailed discussion.
*
* <p>The most important mappings capable of representing every character are the Unicode
* Transformation Format (UTF) charsets. Of those, UTF-8 and the UTF-16 family are the most
* common. UTF-8 (described in <a href="http://www.ietf.org/rfc/rfc3629.txt">RFC 3629</a>)
* encodes a character using 1 to 4 bytes. UTF-16 uses exactly 2 bytes per character (potentially
* wasting space, but allowing efficient random access into BMP text), and UTF-32 uses
* exactly 4 bytes per character (trading off even more space for efficient random access into text
* that includes supplementary characters).
*
* <p>UTF-16 and UTF-32 encode characters directly, using their code point as a two- or four-byte
* integer. This means that any given UTF-16 or UTF-32 byte sequence is either big- or
* little-endian. To assist decoders, Unicode includes a special <i>byte order mark</i> (BOM)
* character U+FEFF used to determine the endianness of a sequence. The corresponding byte-swapped
* code point U+FFFE is guaranteed never to be assigned. If a UTF-16 decoder sees
* {@code 0xfe, 0xff}, for example, it knows it's reading a big-endian byte sequence, while
* {@code 0xff, 0xfe}, would indicate a little-endian byte sequence.
*
* <p>UTF-8 can contain a BOM, but since the UTF-8 encoding of a character always uses the same
* byte sequence, there is no information about endianness to convey. Seeing the bytes
* corresponding to the UTF-8 encoding of U+FEFF ({@code 0xef, 0xbb, 0xbf}) would only serve to
* suggest that you're reading UTF-8. Note that BOMs are decoded as the U+FEFF character, and
* will appear in the output character sequence. This means that a disadvantage to including a BOM
* in UTF-8 is that most applications that use UTF-8 do not expect to see a BOM. (This is also a
* reason to prefer UTF-8: it's one less complication to worry about.)
*
* <p>Because a BOM indicates how the data that follows should be interpreted, a BOM should occur
* as the first character in a character sequence.
*
* <p>See the <a href="http://unicode.org/faq/utf_bom.html#BOM">Byte Order Mark (BOM) FAQ</a> for
* more about dealing with BOMs.
*
* <h4>Endianness and BOM behavior</h4>
*
* <p>The following tables show the endianness and BOM behavior of the UTF-16 variants.
*
* <p>This table shows what the encoder writes. "BE" means that the byte sequence is big-endian,
* "LE" means little-endian. "BE BOM" means a big-endian BOM (that is, {@code 0xfe, 0xff}).
* <p><table width="100%">
* <tr> <th>Charset</th> <th>Encoder writes</th> </tr>
* <tr> <td>UTF-16BE</td> <td>BE, no BOM</td> </tr>
* <tr> <td>UTF-16LE</td> <td>LE, no BOM</td> </tr>
* <tr> <td>UTF-16</td> <td>BE, with BE BOM</td> </tr>
* </table>
*
* <p>The next table shows how each variant's decoder behaves when reading a byte sequence.
* The exact meaning of "failure" in the table is dependent on the
* {@link CodingErrorAction} supplied to {@link CharsetDecoder#malformedInputAction}, so
* "BE, failure" means "the byte sequence is treated as big-endian, and a little-endian BOM
* triggers the malformedInputAction".
*
* <p>The phrase "includes BOM" means that the output includes the U+FEFF byte order mark character.
*
* <p><table width="100%">
* <tr> <th>Charset</th> <th>BE BOM</th> <th>LE BOM</th> <th>No BOM</th> </tr>
* <tr> <td>UTF-16BE</td> <td>BE, includes BOM</td> <td>BE, failure</td> <td>BE</td> </tr>
* <tr> <td>UTF-16LE</td> <td>LE, failure</td> <td>LE, includes BOM</td> <td>LE</td> </tr>
* <tr> <td>UTF-16</td> <td>BE</td> <td>LE</td> <td>BE</td> </tr>
* </table>
*
* <h4>Charset names</h4>
* <p>A charset has a canonical name, returned by {@link #name}. Most charsets will
* also have one or more aliases, returned by {@link #aliases}. A charset can be looked up
* by canonical name or any of its aliases using {@link #forName}.
*
* <h4>Guaranteed-available charsets</h4>
* <p>The following charsets are available on every Java implementation:
* <ul>
* <li>ISO-8859-1
* <li>US-ASCII
* <li>UTF-16
* <li>UTF-16BE
* <li>UTF-16LE
* <li>UTF-8
* </ul>
* <p>All of these charsets support both decoding and encoding. The charsets whose names begin
* "UTF" can represent all characters, as mentioned above. The "ISO-8859-1" and "US-ASCII" charsets
* can only represent small subsets of these characters. Except when required to do otherwise for
* compatibility, new code should use one of the UTF charsets listed above. The platform's default
* charset is UTF-8. (This is in contrast to some older implementations, where the default charset
* depended on the user's locale.)
*
* <p>Most implementations will support hundreds of charsets. Use {@link #availableCharsets} or
* {@link #isSupported} to see what's available. If you intend to use the charset if it's
* available, just call {@link #forName} and catch the exceptions it throws if the charset isn't
* available.
*
* <p>Additional charsets can be made available by configuring one or more charset
* providers through provider configuration files. Such files are always named
* as "java.nio.charset.spi.CharsetProvider" and located in the
* "META-INF/services" directory of one or more classpaths. The files should be
* encoded in "UTF-8". Each line of their content specifies the class name of a
* charset provider which extends {@link java.nio.charset.spi.CharsetProvider}.
* A line should end with '\r', '\n' or '\r\n'. Leading and trailing whitespace
* is trimmed. Blank lines, and lines (after trimming) starting with "#" which are
* regarded as comments, are both ignored. Duplicates of names already found are also
* ignored. Both the configuration files and the provider classes will be loaded
* using the system class loader.
*
* <p>Although this class is thread-safe, the {@link CharsetDecoder} and {@link CharsetEncoder} instances
* it returns are inherently stateful.
*/
public abstract class Charset implements Comparable<Charset> {
    /**
     * Cache of charsets already resolved by {@link #forName}. A charset is stored under its
     * canonical name, under each of its aliases, and under the name the caller used to look it
     * up. All access must be synchronized on the map itself.
     */
    private static final HashMap<String, Charset> CACHED_CHARSETS = new HashMap<String, Charset>();

    /**
     * The default charset, resolved once while this class is being initialized. This field must
     * stay declared after {@code CACHED_CHARSETS}: static initializers run in textual order, and
     * resolving the default goes through {@link #forName}, which uses the cache.
     */
    private static final Charset DEFAULT_CHARSET = getDefaultCharset();

    private final String canonicalName;

    private final HashSet<String> aliasesSet;

    /**
     * Constructs a <code>Charset</code> object. Duplicated aliases are
     * ignored.
     *
     * @param canonicalName
     *            the canonical name of the charset.
     * @param aliases
     *            an array containing all aliases of the charset. May be null.
     * @throws IllegalCharsetNameException
     *             on an illegal value being supplied for either
     *             <code>canonicalName</code> or for any element of
     *             <code>aliases</code>.
     * @throws NullPointerException
     *             if <code>canonicalName</code> or any element of
     *             <code>aliases</code> is null.
     */
    protected Charset(String canonicalName, String[] aliases) {
        // check whether the given canonical name is legal
        checkCharsetName(canonicalName);
        this.canonicalName = canonicalName;
        // check each alias and put into a set (the set silently drops duplicates)
        this.aliasesSet = new HashSet<String>();
        if (aliases != null) {
            for (String alias : aliases) {
                checkCharsetName(alias);
                this.aliasesSet.add(alias);
            }
        }
    }

    /**
     * Verifies that {@code name} is a legal charset name: non-empty and made up only of
     * characters accepted by {@link #isValidCharsetNameCharacter}.
     *
     * @param name the name to check; must not be null.
     * @throws IllegalCharsetNameException if the name is empty or contains an illegal character.
     * @throws NullPointerException if {@code name} is null.
     */
    private static void checkCharsetName(String name) {
        if (name.isEmpty()) {
            throw new IllegalCharsetNameException(name);
        }
        int length = name.length();
        for (int i = 0; i < length; ++i) {
            if (!isValidCharsetNameCharacter(name.charAt(i))) {
                throw new IllegalCharsetNameException(name);
            }
        }
    }

    /**
     * Returns true if {@code c} may appear in a charset name: an ASCII letter, an ASCII digit,
     * or one of {@code '-'}, {@code '.'}, {@code ':'} and {@code '_'}.
     */
    private static boolean isValidCharsetNameCharacter(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') ||
                c == '-' || c == '.' || c == ':' || c == '_';
    }

    /**
     * Returns an immutable case-insensitive map from canonical names to {@code Charset} instances.
     * If multiple charsets have the same canonical name, it is unspecified which is returned in
     * the map. This method may be slow. If you know which charset you're looking for, use
     * {@link #forName}.
     *
     * @return an immutable case-insensitive map from canonical names to {@code Charset} instances
     */
    public static SortedMap<String, Charset> availableCharsets() {
        // Start with a copy of the built-in charsets...
        TreeMap<String, Charset> charsets =
                new TreeMap<String, Charset>(String.CASE_INSENSITIVE_ORDER);
        for (String charsetName : NativeConverter.getAvailableCharsetNames()) {
            Charset charset = NativeConverter.charsetForName(charsetName);
            charsets.put(charset.name(), charset);
        }

        // Add all charsets provided by all charset providers...
        for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class, null)) {
            Iterator<Charset> it = charsetProvider.charsets();
            while (it.hasNext()) {
                Charset cs = it.next();
                // A CharsetProvider can't override a built-in Charset (or a charset that an
                // earlier provider already contributed under the same name).
                if (!charsets.containsKey(cs.name())) {
                    charsets.put(cs.name(), cs);
                }
            }
        }
        return Collections.unmodifiableSortedMap(charsets);
    }

    /**
     * Records {@code cs} in the lookup cache and returns the instance callers should use.
     * If a charset with the same canonical name is already cached, that earlier instance is kept
     * and returned so every caller sees one instance per canonical name.
     *
     * @param charsetName the (already validated) name the caller used for the lookup.
     * @param cs the charset that was found for {@code charsetName}.
     * @return the cached instance for {@code cs}'s canonical name.
     */
    private static Charset cacheCharset(String charsetName, Charset cs) {
        synchronized (CACHED_CHARSETS) {
            // Get the canonical name for this charset, and the canonical instance from the table.
            String canonicalName = cs.name();
            Charset canonicalCharset = CACHED_CHARSETS.get(canonicalName);
            if (canonicalCharset == null) {
                canonicalCharset = cs;
            }

            // Cache the charset by its canonical name...
            CACHED_CHARSETS.put(canonicalName, canonicalCharset);

            // And the name the user used... (Section 1.4 of http://unicode.org/reports/tr22/ means
            // that many non-alias, non-canonical names are valid. For example, "utf8" isn't an
            // alias of the canonical name "UTF-8", but we shouldn't penalize consistent users of
            // such names unduly.)
            CACHED_CHARSETS.put(charsetName, canonicalCharset);

            // And all its aliases...
            for (String alias : cs.aliasesSet) {
                CACHED_CHARSETS.put(alias, canonicalCharset);
            }

            return canonicalCharset;
        }
    }

    /**
     * Returns a {@code Charset} instance for the named charset.
     *
     * @param charsetName a charset name (either canonical or an alias)
     * @return the charset registered under {@code charsetName}.
     * @throws IllegalCharsetNameException
     *             if the specified charset name is illegal (including null).
     * @throws UnsupportedCharsetException
     *             if the desired charset is not supported by this runtime.
     */
    public static Charset forName(String charsetName) {
        // A null name can never be cached, so reject it before taking the cache lock.
        if (charsetName == null) {
            throw new IllegalCharsetNameException(charsetName);
        }

        // Is this charset in our cache?
        Charset cs;
        synchronized (CACHED_CHARSETS) {
            cs = CACHED_CHARSETS.get(charsetName);
        }
        if (cs != null) {
            return cs;
        }

        // Is this a built-in charset supported by ICU?
        checkCharsetName(charsetName);
        cs = NativeConverter.charsetForName(charsetName);
        if (cs != null) {
            return cacheCharset(charsetName, cs);
        }

        // Does a configured CharsetProvider have this charset?
        for (CharsetProvider charsetProvider : ServiceLoader.load(CharsetProvider.class, null)) {
            cs = charsetProvider.charsetForName(charsetName);
            if (cs != null) {
                return cacheCharset(charsetName, cs);
            }
        }

        throw new UnsupportedCharsetException(charsetName);
    }

    /**
     * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
     * which is all pre-nio code claims to throw.
     *
     * @param charsetName a charset name (either canonical or an alias)
     * @return the charset registered under {@code charsetName}.
     * @throws UnsupportedEncodingException
     *             if the lookup fails for any reason; the original failure is set as the cause.
     *
     * @hide
     */
    public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
        try {
            return Charset.forName(charsetName);
        } catch (Exception cause) {
            // Deliberately broad: callers of this method are promised that nothing but
            // UnsupportedEncodingException escapes.
            UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
            ex.initCause(cause);
            throw ex;
        }
    }

    /**
     * Determines whether the specified charset is supported by this runtime.
     *
     * @param charsetName
     *            the name of the charset.
     * @return true if the specified charset is supported, otherwise false.
     * @throws IllegalCharsetNameException
     *             if the specified charset name is illegal.
     */
    public static boolean isSupported(String charsetName) {
        try {
            // Only the success or failure of the lookup matters, not the instance itself.
            forName(charsetName);
            return true;
        } catch (UnsupportedCharsetException ex) {
            return false;
        }
    }

    /**
     * Determines whether this charset is a superset of the given charset. A charset C1 contains
     * charset C2 if every character representable by C2 is also representable by C1. This means
     * that lossless conversion is possible from C2 to C1 (but not necessarily the other way
     * round). It does <i>not</i> imply that the two charsets use the same byte sequences for the
     * characters they share.
     *
     * <p>Note that this method is allowed to be conservative, and some implementations may return
     * false when this charset does contain the other charset. Android's implementation is precise,
     * and will always return true in such cases.
     *
     * @param charset
     *            a given charset.
     * @return true if this charset is a super set of the given charset,
     *         false if it's unknown or this charset is not a superset of
     *         the given charset.
     */
    public abstract boolean contains(Charset charset);

    /**
     * Gets a new instance of an encoder for this charset.
     *
     * @return a new instance of an encoder for this charset.
     */
    public abstract CharsetEncoder newEncoder();

    /**
     * Gets a new instance of a decoder for this charset.
     *
     * @return a new instance of a decoder for this charset.
     */
    public abstract CharsetDecoder newDecoder();

    /**
     * Gets the canonical name of this charset.
     *
     * @return this charset's name in canonical form.
     */
    public final String name() {
        return this.canonicalName;
    }

    /**
     * Gets the set of this charset's aliases.
     *
     * @return an unmodifiable set of this charset's aliases.
     */
    public final Set<String> aliases() {
        return Collections.unmodifiableSet(this.aliasesSet);
    }

    /**
     * Gets the name of this charset for the default locale.
     *
     * <p>The default implementation returns the canonical name of this charset.
     * Subclasses may return a localized display name.
     *
     * @return the name of this charset for the default locale.
     */
    public String displayName() {
        return this.canonicalName;
    }

    /**
     * Gets the name of this charset for the specified locale.
     *
     * <p>The default implementation returns the canonical name of this charset.
     * Subclasses may return a localized display name.
     *
     * @param l
     *            a certain locale
     * @return the name of this charset for the specified locale
     */
    public String displayName(Locale l) {
        return this.canonicalName;
    }

    /**
     * Indicates whether this charset is known to be registered in the IANA
     * Charset Registry. This implementation treats every charset whose canonical
     * name does not start with "x-" or "X-" as registered.
     *
     * @return true if the charset is known to be registered, otherwise returns
     *         false.
     */
    public final boolean isRegistered() {
        return !canonicalName.startsWith("x-") && !canonicalName.startsWith("X-");
    }

    /**
     * Returns true if this charset supports encoding, false otherwise.
     *
     * <p>The default implementation always returns true.
     *
     * @return true if this charset supports encoding, false otherwise.
     */
    public boolean canEncode() {
        return true;
    }

    /**
     * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from
     * {@code buffer}.
     * This method uses {@code CodingErrorAction.REPLACE}.
     *
     * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
     * for performance.
     *
     * @param buffer
     *            the character buffer containing the content to be encoded.
     * @return the result of the encoding.
     */
    public final ByteBuffer encode(CharBuffer buffer) {
        try {
            return newEncoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE)
                    .encode(buffer);
        } catch (CharacterCodingException ex) {
            // Both error actions are REPLACE, so a coding exception here is not expected.
            throw new Error(ex.getMessage(), ex);
        }
    }

    /**
     * Returns a new {@code ByteBuffer} containing the bytes encoding the characters from {@code s}.
     * This method uses {@code CodingErrorAction.REPLACE}.
     *
     * <p>Applications should generally create a {@link CharsetEncoder} using {@link #newEncoder}
     * for performance.
     *
     * @param s the string to be encoded.
     * @return the result of the encoding.
     */
    public final ByteBuffer encode(String s) {
        return encode(CharBuffer.wrap(s));
    }

    /**
     * Returns a new {@code CharBuffer} containing the characters decoded from {@code buffer}.
     * This method uses {@code CodingErrorAction.REPLACE}.
     *
     * <p>Applications should generally create a {@link CharsetDecoder} using {@link #newDecoder}
     * for performance.
     *
     * @param buffer
     *            the byte buffer containing the content to be decoded.
     * @return a character buffer containing the output of the decoding.
     */
    public final CharBuffer decode(ByteBuffer buffer) {
        try {
            return newDecoder()
                    .onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE)
                    .decode(buffer);
        } catch (CharacterCodingException ex) {
            // Both error actions are REPLACE, so a coding exception here is not expected.
            throw new Error(ex.getMessage(), ex);
        }
    }

    /*
     * -------------------------------------------------------------------
     * Methods implementing parent interface Comparable
     * -------------------------------------------------------------------
     */

    /**
     * Compares this charset with the given charset. This comparison is
     * based on the case insensitive canonical names of the charsets.
     *
     * <p>Note that this ordering ignores case while {@link #equals} does not, so two
     * charsets can compare as 0 here without being equal.
     *
     * @param charset
     *            the given object to be compared with.
     * @return a negative integer if less than the given object, a positive
     *         integer if larger than it, or 0 if equal to it.
     */
    public final int compareTo(Charset charset) {
        return this.canonicalName.compareToIgnoreCase(charset.canonicalName);
    }

    /*
     * -------------------------------------------------------------------
     * Methods overriding parent class Object
     * -------------------------------------------------------------------
     */

    /**
     * Determines whether this charset equals to the given object. They are
     * considered to be equal if they have the same canonical name
     * (compared case-sensitively).
     *
     * @param obj
     *            the given object to be compared with.
     * @return true if they have the same canonical name, otherwise false.
     */
    @Override
    public final boolean equals(Object obj) {
        if (obj instanceof Charset) {
            Charset that = (Charset) obj;
            return this.canonicalName.equals(that.canonicalName);
        }
        return false;
    }

    /**
     * Gets the hash code of this charset, which is the hash code of its canonical name.
     *
     * @return the hash code of this charset.
     */
    @Override
    public final int hashCode() {
        return this.canonicalName.hashCode();
    }

    /**
     * Gets a string representation of this charset: the runtime class name followed by the
     * canonical name in square brackets.
     *
     * @return a string representation of this charset.
     */
    @Override
    public final String toString() {
        return getClass().getName() + "[" + this.canonicalName + "]";
    }

    /**
     * Returns the system's default charset. This is determined during VM startup, and will not
     * change thereafter. On Android, the default charset is UTF-8.
     *
     * @return the default charset.
     */
    public static Charset defaultCharset() {
        return DEFAULT_CHARSET;
    }

    /**
     * Resolves the default charset from the {@code file.encoding} system property, using
     * UTF-8 when the property is unset, names a charset this runtime doesn't support, or is
     * not even a legal charset name.
     *
     * <p>This runs from a static initializer, so any exception escaping from here would make
     * the whole class fail to initialize. That is why a bad property value must not propagate.
     */
    private static Charset getDefaultCharset() {
        String encoding = AccessController.doPrivileged(new PrivilegedAction<String>() {
            public String run() {
                return System.getProperty("file.encoding", "UTF-8");
            }
        });
        try {
            return Charset.forName(encoding);
        } catch (IllegalCharsetNameException e) {
            // The property holds something that isn't a legal charset name (for example an
            // empty string, or a value containing whitespace).
            return Charset.forName("UTF-8");
        } catch (UnsupportedCharsetException e) {
            // The property names a charset that is well-formed but not available here.
            return Charset.forName("UTF-8");
        }
    }
}