CharsetConverter.java #1

package com.perforce.p4java;

import com.perforce.p4java.exception.ClientError;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;

/**
 * P4Java charset converter class.
 * <p>
 * Decodes bytes using a "from" charset and re-encodes the resulting characters
 * using a "to" charset. Both the decoder and the encoder are configured to
 * report (rather than replace) malformed input and unmappable characters.
 * <p>
 * An instance keeps state between calls (pending underflow bytes and whether a
 * UTF-16 byte order mark has already been emitted), so the same instance should
 * be used for all chunks of one logical stream.
 */
public class CharsetConverter {

	private final CharsetDecoder decoder;
	private final CharsetEncoder encoder;

	// Bytes left undecoded by the last convert(ByteBuffer, ...) call, or null
	private byte[] underflow;
	// True when the target charset is UTF-16, whose encoder writes a BOM on
	// every call to encode
	private boolean checkBOM = false;
	// True once BOMs must be stripped from encoded output
	private boolean ignoreBOM = false;

	/**
	 * Creates a new charset converter that decodes/encodes bytes using the
	 * specified non-null from/to charset objects.
	 * 
	 * @param fromCharset
	 *            - charset used to decode incoming bytes
	 * @param toCharset
	 *            - charset used to encode outgoing bytes
	 * @param ignoreBOM
	 *            - true to ignore any byte order marks written by the UTF-16
	 *            charset and omit them from all returned byte buffers; false
	 *            to keep the byte order mark produced by the first conversion
	 *            only and omit it from later ones
	 */
	public CharsetConverter(Charset fromCharset, Charset toCharset,
			boolean ignoreBOM) {
		// Create decoder that reports malformed/unmappable values
		this.decoder = fromCharset.newDecoder();
		this.decoder.onMalformedInput(CodingErrorAction.REPORT);
		this.decoder.onUnmappableCharacter(CodingErrorAction.REPORT);

		// Create encoder that reports malformed/unmappable values
		this.encoder = toCharset.newEncoder();
		this.encoder.onMalformedInput(CodingErrorAction.REPORT);
		this.encoder.onUnmappableCharacter(CodingErrorAction.REPORT);

		// Check BOM on UTF-16 since Java writes a BOM on each call to encode
		if ("UTF-16".equals(toCharset.name())) {
			checkBOM = true;
		}
		this.ignoreBOM = ignoreBOM;
	}

	/**
	 * Get charset name of from charset used to decode
	 * 
	 * @return - charset name
	 */
	public String getFromCharsetName() {
		return this.decoder.charset().name();
	}

	/**
	 * Get charset name of to charset used to encode
	 * 
	 * @return - charset name
	 */
	public String getToCharsetName() {
		return this.encoder.charset().name();
	}

	/**
	 * Creates a new charset converter that decodes/encodes bytes using the
	 * specified non-null from/to charset objects. The byte order mark of the
	 * first UTF-16 conversion is kept; see
	 * {@link #CharsetConverter(Charset, Charset, boolean)}.
	 * 
	 * @param fromCharset
	 *            - charset used to decode incoming bytes
	 * @param toCharset
	 *            - charset used to encode outgoing bytes
	 */
	public CharsetConverter(Charset fromCharset, Charset toCharset) {
		this(fromCharset, toCharset, false);
	}

	/**
	 * Get and clear the current converter underflow byte array. The result of
	 * this method should be wrapped in a {@link ByteBuffer} and specified as
	 * the from buffer on a call to {@link #convert(ByteBuffer)} to try to
	 * convert any remaining bytes.
	 * 
	 * @return - byte array of underflow or null if the last call to
	 *         {@link #convert(ByteBuffer)} did not have underflow.
	 */
	public byte[] clearUnderflow() {
		byte[] cleared = this.underflow;
		this.underflow = null;
		return cleared;
	}

	/**
	 * Converts a char buffer to a byte buffer using the toCharset. This ignores
	 * any existing underflow since the characters to convert are already
	 * complete and known.
	 * <p>
	 * Note that the returned buffer's limit is a byte count, not an end index:
	 * when a byte order mark is skipped the position is 2 and the limit is the
	 * number of bytes following the mark.
	 * 
	 * @param from
	 *            - characters to encode
	 * @return - byte buffer, use {@link ByteBuffer#position()} for starting
	 *         array offset, {@link ByteBuffer#limit()} for number of bytes to
	 *         read, and {@link ByteBuffer#array()} for the byte[] itself.
	 * @throws ClientError
	 *             if the characters cannot be encoded in the toCharset
	 */
	public ByteBuffer convert(CharBuffer from) {
		ByteBuffer converted = null;
		try {

			// Encode back to byte buffer
			converted = encoder.encode(from);

			if (checkBOM) {
				// Ignore BOM if UTF-16 and not first call to convert
				if (ignoreBOM) {
					int limit = converted.limit();
					if (limit > 2) {
						byte[] bom = new byte[2];
						converted.get(bom);
						// byte value of -2 == 0xFE
						// byte value of -1 == 0xFF
						// Big Endian BOM = FEFF
						// Little Endian BOM = FFFE
						if ((bom[0] == -2 && bom[1] == -1)
								|| (bom[0] == -1 && bom[1] == -2)) {
							// Advance past BOM if detected; the limit becomes
							// the count of bytes after the BOM
							converted.position(2);
							converted.limit(limit - 2);
						} else {
							// Rewind buffer if BOM not found
							converted.rewind();
						}
					}
				} else {
					// First conversion keeps its BOM; strip it from now on
					ignoreBOM = true;
				}
			} else {
				converted.position(0);
			}
		} catch (CharacterCodingException cce) {
			Log.exception(cce);
			throw new ClientError("Translation of file content failed", cce);
		}
		return converted;
	}

	/**
	 * Convert a byte buffer by decoding using the fromCharset and encoding
	 * using the toCharset. The byte buffer returned will have its position be
	 * the array offset to use and the limit be the length of bytes to read from
	 * the byte buffer's backing array.
	 * 
	 * Any remaining bytes that couldn't be converted are stored locally until
	 * the next call to {@link #convert(ByteBuffer)}. The from buffer specified
	 * will be joined with the underflow from a previous call on subsequent
	 * calls to {@link #convert(ByteBuffer)}. Only the bytes between the from
	 * buffer's position and limit are consumed.
	 * 
	 * @param from
	 *            - byte buffer to convert
	 * @param lookahead
	 *            - lookahead callback, may be null. It is queried with the last
	 *            decoded character and may return extra bytes to decode as part
	 *            of this call.
	 * @return - byte buffer, use {@link ByteBuffer#position()} for starting
	 *         array offset, {@link ByteBuffer#limit()} for number of bytes to
	 *         read, and {@link ByteBuffer#array()} for the byte[] itself.
	 * @throws ClientError
	 *             if the decoded characters cannot be encoded in the toCharset
	 */
	public ByteBuffer convert(ByteBuffer from, ILookahead lookahead) {
		ByteBuffer converted = null;

		// Check if there are any left over bytes that weren't converted from
		// the last chunk
		if (underflow != null) {
			// Size by the bytes actually pending in 'from' (not its backing
			// array) and flip so the limit marks the end of the joined data;
			// otherwise unused trailing zero bytes would be decoded as well.
			ByteBuffer joinedBuffer = ByteBuffer.allocate(underflow.length
					+ from.remaining());
			joinedBuffer.put(underflow);
			joinedBuffer.put(from);
			joinedBuffer.flip();
			from = joinedBuffer;
			this.underflow = null;
		}

		CharBuffer sourceChars = allocateCharsFor(from);

		// The coder result is not inspected here: whatever bytes the decoder
		// stops on stay in 'from' and are saved as underflow below.
		decoder.decode(from, sourceChars, true);
		sourceChars.flip();

		if (lookahead != null && sourceChars.limit() > 0) {

			// Get an array of bytes to attempt to convert
			byte[] ahead = lookahead.bytesToAdd(sourceChars.charAt(sourceChars
					.limit() - 1));

			// Look until no more lookahead is triggered by callback
			while (ahead != null && ahead.length > 0) {
				byte[] next = null;

				// Join any bytes not yet decoded with the lookahead bytes
				if (from.hasRemaining()) {
					int remaining = from.remaining();
					next = new byte[ahead.length + remaining];
					from.get(next, 0, remaining);
					System.arraycopy(ahead, 0, next, remaining, ahead.length);
				} else {
					next = ahead;
				}

				from = ByteBuffer.wrap(next);
				// Create new char buffer with underflow + lookahead
				CharBuffer aheadChars = allocateCharsFor(from);

				// Decode underflow + lookahead
				decoder.decode(from, aheadChars, true);
				aheadChars.flip();

				// If decoding produced at least one usable character then join
				// with main char buffer and query for more lookahead based on
				// new ending char in buffer
				if (aheadChars.limit() > 0) {
					CharBuffer joinedChars = CharBuffer.allocate(aheadChars
							.limit()
							+ sourceChars.limit());
					joinedChars.put(sourceChars);
					joinedChars.put(aheadChars);
					sourceChars = joinedChars;
					sourceChars.rewind();
					ahead = lookahead.bytesToAdd(sourceChars.charAt(sourceChars
							.limit() - 1));
				} else {
					// If no chars were decoded then break out of loop
					ahead = null;
				}
			}
		}

		// Store any left over bytes for the next write chunk
		if (from.hasRemaining()) {
			byte[] leftOver = new byte[from.remaining()];
			from.get(leftOver, 0, from.remaining());
			this.underflow = leftOver;
		}

		// Encode back to byte buffer
		converted = convert(sourceChars);
		return converted;
	}

	/**
	 * Convert a byte buffer by decoding using the fromCharset and encoding
	 * using the toCharset. The byte buffer returned will have its position be
	 * the array offset to use and the limit be the length of bytes to read from
	 * the byte buffer's backing array.
	 * 
	 * Any remaining bytes that couldn't be converted are stored locally until
	 * the next call to {@link #convert(ByteBuffer)}. The from buffer specified
	 * will be joined with the underflow from a previous call on subsequent
	 * calls to {@link #convert(ByteBuffer)}.
	 * 
	 * @param from
	 *            - byte buffer to convert
	 * @return - byte buffer, use {@link ByteBuffer#position()} for starting
	 *         array offset, {@link ByteBuffer#limit()} for number of bytes to
	 *         read, and {@link ByteBuffer#array()} for the byte[] itself.
	 * @throws ClientError
	 *             if the decoded characters cannot be encoded in the toCharset
	 */
	public ByteBuffer convert(ByteBuffer from) {
		return convert(from, null);
	}

	/**
	 * Allocates a char buffer large enough to hold every character the decoder
	 * can produce from the bytes remaining in the given buffer.
	 * 
	 * @param bytes
	 *            - buffer about to be decoded
	 * @return - empty char buffer sized from the decoder's maximum chars per
	 *         byte
	 */
	private CharBuffer allocateCharsFor(ByteBuffer bytes) {
		return CharBuffer.allocate(Math.round(decoder.maxCharsPerByte()
				* bytes.remaining()) + 1);
	}

}
#	Change	User	Description
#1	19903	stuartrowe	Branching //guest/perforce_software/p4java/... to //guest/stuartrowe/p4java/...
//guest/perforce_software/p4java/r14.1/src/main/java/com/perforce/p4java/CharsetConverter.java
#1	12541	Matt Attaway	Initial add of the 14.1 p4java source code