htmlparser/jericho/StreamEncodingDetector.java

// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.

package net.htmlparser.jericho;

import java.util.*;
import java.io.*;
import java.nio.charset.*;
import java.net.*;

/**
 * Makes a preliminary determination of the character encoding of a byte stream.
 * <p>
 * The encoding is taken from the first of the following sources that yields a result:
 * <ol>
 *  <li>the charset parameter of the <code>Content-Type</code> header (only when constructed from a {@link URLConnection}),
 *  <li>a Unicode Byte Order Mark (BOM) at the start of the stream,
 *  <li>a guess based on the position of 00 bytes, or of common EBCDIC byte sequences, within the first four bytes of the stream.
 * </ol>
 * Only the first two sources are regarded as definitive.
 * <p>
 * Based on information in:
 * http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
 * http://www.w3.org/TR/html401/charset.html#h-5.2
 */
final class StreamEncodingDetector {
        /** The stream to decode.  Wrapped in a {@link BufferedInputStream} when the BOM/guess detection needs mark/reset and the supplied stream lacks it. */
        private final InputStream inputStream;
        /** Name of the detected encoding, or <code>null</code> if the stream was found to be empty. */
        private String encoding=null;
        /** Human readable description of how {@link #encoding} was determined. */
        private String encodingSpecificationInfo=null;
        /** Cleared by {@link #init()} as soon as the encoding has to be guessed rather than read from a header or BOM. */
        private boolean definitive=true;
        /** Cleared by {@link #init()} when no BOM was found and the stream holds fewer than four bytes. */
        private boolean documentSpecifiedEncodingPossible=true;
        /** Set when the stream was empty, or when nothing follows the Byte Order Mark. */
        private boolean endOfFile;
        private final LoggerQueue logger=new LoggerQueue();
        
        private static final String UTF_16="UTF-16";
        private static final String UTF_16BE="UTF-16BE";
        private static final String UTF_16LE="UTF-16LE";
        private static final String UTF_8="UTF-8";
        private static final String ISO_8859_1="ISO-8859-1";
        private static final String EBCDIC="Cp037"; // aka IBM037, not guaranteed, but available on most platforms
        private static final String WINDOWS_1252="Cp1252"; // aka Windows-1252, not guaranteed
        // Preliminary encoding used for ASCII-compatible 8-bit data: Windows-1252 where available, otherwise ISO-8859-1.
        private static final String DEFAULT_8BIT=EncodingDetector.isEncodingSupported(WINDOWS_1252)?WINDOWS_1252:ISO_8859_1;

        // All of the following encodings are generally not supported in java and will usually throw an exception if decoding is attempted.
        // Specified explicitly using Byte Order Mark:
        private static final String SCSU="SCSU";
        private static final String UTF_7="UTF-7";
        private static final String UTF_EBCDIC="UTF-EBCDIC";
        private static final String BOCU_1="BOCU-1";
        private static final String UTF_32="UTF-32";
        // Guessed from presence of 00 bytes in first four bytes:
        private static final String UTF_32BE="UTF-32BE";
        private static final String UTF_32LE="UTF-32LE";

        /**
         * Detects the encoding of the content served by the specified connection.
         * <p>
         * A supported charset named in the <code>Content-Type</code> header is used as is, and the content itself is then not inspected.
         * Otherwise the detection falls back to the Byte Order Mark / first-bytes analysis performed by {@link #init()}.
         *
         * @param urlConnection  the connection whose input stream is to be decoded.
         * @throws IOException if the input stream can not be opened or read.
         */
        public StreamEncodingDetector(final URLConnection urlConnection) throws IOException {
                // urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
                final InputStream urlInputStream=urlConnection.getInputStream();
                final String contentType=urlConnection.getContentType();
                if (contentType!=null) {
                        encoding=Source.getCharsetParameterFromHttpHeaderValue(contentType);
                        if (encoding!=null && encoding.length()>0) {
                                boolean encodingSupported=false;
                                try {
                                        if (Charset.isSupported(encoding)) encodingSupported=true;
                                } catch (IllegalCharsetNameException ex) {
                                        // Some servers wrap the charset name in double quotes; retry with the quotes removed.
                                        if (encoding.charAt(0)=='"') {
                                                String encodingWithoutQuotes=encoding.replace("\"","");
                                                if (EncodingDetector.isEncodingSupported(encodingWithoutQuotes)) {
                                                        logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes, which have been ignored");
                                                        encodingSupported=true;
                                                } else {
                                                        logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes");
                                                }
                                                encoding=encodingWithoutQuotes;
                                        }
                                }
                                if (encodingSupported) {
                                        // The header is trusted, so no bytes are inspected and no mark/reset support is required.
                                        inputStream=urlInputStream;
                                        encodingSpecificationInfo="HTTP header Content-Type: "+contentType;
                                        return;
                                }
                                logger.warn("Encoding "+encoding+" specified in HTTP header is not supported, attempting other means of detection");
                        }
                }
                inputStream=urlInputStream.markSupported() ? urlInputStream : new BufferedInputStream(urlInputStream);
                init();
        }

        /**
         * Detects the encoding of the specified stream from its Byte Order Mark or, failing that, from its first four bytes.
         *
         * @param inputStream  the stream to decode; it is wrapped in a {@link BufferedInputStream} if it does not support mark/reset.
         * @throws IOException if the stream can not be read.
         */
        public StreamEncodingDetector(final InputStream inputStream) throws IOException {
                this.inputStream=inputStream.markSupported() ? inputStream : new BufferedInputStream(inputStream);
                init();
        }
        
        /**
         * Returns the stream from which the content should be read.
         * This is not necessarily the instance passed to the constructor, as it may have been wrapped to provide mark/reset support.
         */
        public InputStream getInputStream() {
                return inputStream;
        }

        /** Returns the name of the detected encoding, or <code>null</code> if the stream was empty. */
        public String getEncoding() {
                return encoding;
        }
        
        /** Returns a description of how the encoding was determined. */
        public String getEncodingSpecificationInfo() {
                return encodingSpecificationInfo;
        }

        /**
         * Indicates whether the encoding was determined without guessing.
         * Correctly spelt equivalent of {@link #isDifinitive()}.
         */
        public boolean isDefinitive() {
                return definitive;
        }

        /**
         * Indicates whether the encoding was determined without guessing.
         * The misspelt name is retained so that existing callers keep working; new code should call {@link #isDefinitive()}.
         */
        public boolean isDifinitive() {
                return definitive;
        }

        /** Indicates whether the stream was empty or contained nothing after its Byte Order Mark. */
        public boolean isEndOfFile() {
                return endOfFile;
        }

        /** Returns <code>false</code> if the stream has no Byte Order Mark and is shorter than four bytes, otherwise <code>true</code>. */
        public boolean isDocumentSpecifiedEncodingPossible() {
                return documentSpecifiedEncodingPossible;
        }

        /** Returns the queue holding the warnings logged during detection. */
        public LoggerQueue getLoggerQueue() {
                return logger;
        }

        /**
         * Opens a reader that decodes the stream using the detected encoding.
         *
         * @return a new reader over {@link #getInputStream()}.
         * @throws UnsupportedEncodingException if the detected encoding is not supported on this platform.
         */
        public Reader openReader() throws UnsupportedEncodingException {
                if (encoding==null) return new InputStreamReader(inputStream,DEFAULT_8BIT); // encoding==null only if input stream is empty so use an arbitrary encoding.
                if (!EncodingDetector.isEncodingSupported(encoding)) throw new UnsupportedEncodingException(encoding+" - "+encodingSpecificationInfo);
                return new InputStreamReader(inputStream,encoding);
        }

        /**
         * Records the outcome of the detection and discards the leading bytes that must not reach the decoder.
         *
         * @param encoding  the detected encoding name, or <code>null</code> for an empty stream.
         * @param encodingSpecificationInfo  description of how the encoding was determined.
         * @param skipChars  number of bytes to consume from the start of the stream (despite the name, these are bytes and not characters).
         * @param endOfFile  whether the stream holds no content.
         * @return always <code>true</code>, which allows the call to be used as a return expression in {@link #init()}.
         */
        private boolean setEncoding(final String encoding, final String encodingSpecificationInfo, int skipChars, boolean endOfFile) throws IOException {
                this.encoding=encoding;
                this.encodingSpecificationInfo=encodingSpecificationInfo;
                this.endOfFile=endOfFile;
                for (int i=0; i<skipChars; i++) inputStream.read();
                return true;
        }

        /**
         * Inspects up to the first five bytes of the stream and sets the encoding accordingly.
         * The stream is reset to its start before any Byte Order Mark is skipped, so no content bytes are lost.
         *
         * @return always <code>true</code>.
         */
        private boolean init() throws IOException {
                inputStream.mark(5);
                final int b1=inputStream.read();
                if (b1==-1) return setEncoding(null,"empty input stream",0,true);
                final int b2=inputStream.read();
                final int b3=inputStream.read();
                final int b4=inputStream.read();
                final int b5=inputStream.read(); // only used to tell whether anything follows a four byte BOM
                inputStream.reset();
                // Check for Unicode Byte Order Mark:
                if (b1==0xEF) {
                        if (b2==0xBB && b3==0xBF) return setEncoding(UTF_8,"UTF-8 Byte Order Mark (EF BB BF)",3,b4==-1);
                } else if (b1==0xFE) {
                        // Skipping the big-endian BOM is harmless as the generic UTF-16 decoder defaults to big-endian when no BOM is present.
                        if (b2==0xFF) return setEncoding(UTF_16,"UTF-16 big-endian Byte Order Mark (FE FF)",2,b3==-1);
                } else if (b1==0xFF) {
                        if (b2==0xFE) {
                                // The little-endian BOMs must be left in the stream (skip 0 bytes):
                                // the generic "UTF-16" decoder falls back to big-endian if it does not see a BOM, which would garble the content.
                                // The decoder consumes the BOM itself.  The generic "UTF-32" decoder is assumed to behave the same way.
                                if (b3==0 && b4==0) return setEncoding(UTF_32,"UTF-32 little-endian Byte Order Mark (FF FE 00 00)",0,b5==-1);
                                return setEncoding(UTF_16,"UTF-16 little-endian Byte Order Mark (FF FE)",0,b3==-1);
                        }
                } else if (b1==0) {
                        if (b2==0 && b3==0xFE && b4==0xFF) return setEncoding(UTF_32,"UTF-32 big-endian Byte Order Mark (00 00 FE FF)",4,b5==-1);
                } else if (b1==0x0E) {
                        if (b2==0xFE && b3==0xFF) return setEncoding(SCSU,"SCSU Byte Order Mark (0E FE FF)",3,b4==-1);
                } else if (b1==0x2B) {
                        if (b2==0x2F && b3==0x76) return setEncoding(UTF_7,"UTF-7 Byte Order Mark (2B 2F 76)",3,b4==-1);
                } else if (b1==0xDD) {
                        if (b2==0x73 && b3==0x66 && b4==0x73) return setEncoding(UTF_EBCDIC,"UTF-EBCDIC Byte Order Mark (DD 73 66 73)",4,b5==-1);
                } else if (b1==0xFB) {
                        if (b2==0xEE && b3==0x28) return setEncoding(BOCU_1,"BOCU-1 Byte Order Mark (FB EE 28)",3,b4==-1);
                }
                // No Unicode Byte Order Mark found.  Have to start guessing.
                definitive=false;
                // The best we can do is to provide an encoding that reflects the correct number and ordering of bytes for characters in the ASCII range.
                // The result will be one of DEFAULT_8BIT, EBCDIC, UTF_16BE, UTF_16LE, UTF_32BE or UTF_32LE.
                // Assumes 00 bytes indicate multi-byte encodings rather than the presence of NUL characters or characters with a code that is a multiple of 0x100.
                if (b4==-1) {
                        // The stream contains between 1 and 3 bytes.
                        // This means the document can't possibly specify the encoding, so make a best guess based on the first 3 bytes.
                        documentSpecifiedEncodingPossible=false;
                        // It might be possible to rule out some encodings based on these bytes, but it is impossible to make a definite determination.
                        // The main thing to determine is whether it is an 8-bit or 16-bit encoding.
                        // In order to guess the most likely encoding, assume that the text contains only ASCII characters, and that any 00 bytes indicate a 16-bit encoding.
                        // UTF-8 is not a candidate for the 8-bit case as it uses a variable number of bytes per character,
                        // so DEFAULT_8BIT (Windows-1252 where available, otherwise ISO-8859-1) is used instead.
                        // An odd number of bytes can not be a whole number of 16-bit characters, so assume an 8-bit encoding regardless of whether any 00 bytes are present.
                        if (b2==-1) return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (stream 1 byte long)",0,false);
                        if (b3!=-1) return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (stream 3 bytes long)",0,false);
                        // The stream contains exactly 2 bytes.
                        if (b1==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with 00, stream 2 bytes long)",0,false);
                        if (b2==0) return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream pattern XX 00, stream 2 bytes long)",0,false);
                        // No 00 bytes present, assume 8-bit encoding:
                        return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present, stream 2 bytes long)",0,false);
                }
                // Stream contains at least 4 bytes.
                // The patterns used for documentation are made up of:
                //   0 - zero byte
                //   X - non-zero byte
                //   ? - byte value not yet determined
                if (b1==0) {
                        // pattern 0???
                        if (b2==0) return setEncoding(UTF_32BE,"default 32-bit BE encoding (byte stream starts with 00 00)",0,false); // pattern 00?? most likely indicates UTF-32BE
                        // pattern 0X??
                        // Regardless of the final two bytes, assume that the first two bytes indicate a 16-bit BE encoding.
                        // There are many circumstances where this could be an incorrect assumption, for example:
                        //   - UTF-16LE encoding with first character U+0100 (or any other character whose code is a multiple of 100Hex)
                        //   - any encoding with first character NUL
                        //   - UTF-32BE encoding with first character outside of Basic Multilingual Plane (BMP)
                        // Checking the final two bytes might give some clues as to whether any of these other situations are more likely,
                        // but none of the clues will yield less than a 50% chance that the encoding is in fact UTF-16BE as suggested by the first two bytes.
                        return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with 00)",0,false); // >=50% chance that encoding is UTF-16BE
                }
                // pattern X???
                if (b4==0) {
                        // pattern X??0
                        if (b3==0) return setEncoding(UTF_32LE,"default 32-bit LE encoding (byte stream starts with pattern XX ?? 00 00)",0,false); // pattern X?00 most likely indicates UTF-32LE
                        // pattern X?X0
                        return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream stars with pattern XX ?? XX 00)",0,false); // Regardless of the second byte, assume the fourth 00 byte indicates UTF-16LE.
                }
                // pattern X??X
                if (b2==0) {
                        // pattern X0?X
                        // Assuming the second 00 byte doesn't indicate a NUL character, and that it is very unlikely that this is a 32-bit encoding
                        // of a character outside of the BMP, we can assume that it indicates a 16-bit encoding.
                        // If the pattern is X00X, there is a 50/50 chance that the encoding is BE or LE, with one of the characters having a code that is a multiple of 0x100.
                        // This should be a very rare occurrence, and there is no more than a 50% chance that the encoding
                        // will be different to that assumed (UTF-16LE) without checking for this occurrence, so don't bother checking for it.
                        // If the pattern is X0XX, this is likely to indicate a 16-bit LE encoding with the second character > U+00FF.
                        return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream starts with pattern XX 00 ?? XX)",0,false);
                }
                // pattern XX?X
                if (b3==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with pattern XX XX 00 XX)",0,false); // pattern XX0X likely to indicate a 16-bit BE encoding with the first character > U+00FF.
                // pattern XXXX
                // Although it is still possible that this is a 16-bit encoding with the first two characters > U+00FF,
                // assume the more likely case of four 8-bit characters <= U+00FF.
                // Check whether it fits some common EBCDIC strings that might be found at the start of a document:
                if (b1==0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
                        if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)",0,false); // first four bytes are "<?xm" in EBCDIC
                        if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)",0,false); // first four bytes are "<!DO" in EBCDIC
                        if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)",0,false); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" or "<htm"
                        // although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
                }
                // Now assume that it is not EBCDIC, but some other 8-bit encoding.
                // Most other 8-bit encodings are compatible with ASCII.
                // Since a document specified encoding requires only ASCII characters, just choose an arbitrary 8-bit preliminary encoding.
                // UTF-8 is however not a good choice as it is not strictly an 8-bit encoding.
                // UTF-8 bytes with a value >= 0x80 indicate the presence of a multi-byte character, and there are many byte values that are illegal.
                // Therefore use DEFAULT_8BIT, which is Windows-1252 where available and otherwise ISO-8859-1.
                return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present in first four bytes of stream)",0,false);
        }
}
1	// Jericho HTML Parser - Java based library for analysing and manipulating HTML
2	// Version 3.2
3	// Copyright (C) 2004-2009 Martin Jericho
4	// http://jericho.htmlparser.net/
5	//
6	// This library is free software; you can redistribute it and/or
7	// modify it under the terms of either one of the following licences:
8	//
9	// 1. The Eclipse Public License (EPL) version 1.0,
10	// included in this distribution in the file licence-epl-1.0.html
11	// or available at http://www.eclipse.org/legal/epl-v10.html
12	//
13	// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14	// included in this distribution in the file licence-lgpl-2.1.txt
15	// or available at http://www.gnu.org/licenses/lgpl.txt
16	//
17	// This library is distributed on an "AS IS" basis,
18	// WITHOUT WARRANTY OF ANY KIND, either express or implied.
19	// See the individual licence texts for more details.
20
21	package net.htmlparser.jericho;
22
23	import java.util.*;
24	import java.io.*;
25	import java.nio.charset.*;
26	import java.net.*;
27
28	/**
29	* Based on information in:
30	* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
31	* http://www.w3.org/TR/html401/charset.html#h-5.2
32	*/
33	final class StreamEncodingDetector {
34	private final InputStream inputStream;
35	private String encoding=null;
36	private String encodingSpecificationInfo=null;
37	private boolean definitive=true;
38	private boolean documentSpecifiedEncodingPossible=true;
39	private boolean endOfFile;
40	private final LoggerQueue logger=new LoggerQueue();
41
42	private static final String UTF_16="UTF-16";
43	private static final String UTF_16BE="UTF-16BE";
44	private static final String UTF_16LE="UTF-16LE";
45	private static final String UTF_8="UTF-8";
46	private static final String ISO_8859_1="ISO-8859-1";
47	private static final String EBCDIC="Cp037"; // aka IBM037, not guaranteed, but available on most platforms
48	private static final String WINDOWS_1252="Cp1252"; // aka Windows-1252, not guaranteed
49	private static final String DEFAULT_8BIT=EncodingDetector.isEncodingSupported(WINDOWS_1252)?WINDOWS_1252:ISO_8859_1;
50
51	// All of the following encodings are generally not supported in java and will usually throw an exception if decoding is attempted.
52	// Specified explicitly using Byte Order Mark:
53	private static final String SCSU="SCSU";
54	private static final String UTF_7="UTF-7";
55	private static final String UTF_EBCDIC="UTF-EBCDIC";
56	private static final String BOCU_1="BOCU-1";
57	private static final String UTF_32="UTF-32";
58	// Guessed from presence of 00 bytes in first four bytes:
59	private static final String UTF_32BE="UTF-32BE";
60	private static final String UTF_32LE="UTF-32LE";
61
62	public StreamEncodingDetector(final URLConnection urlConnection) throws IOException {
63	final HttpURLConnection httpURLConnection=(urlConnection instanceof HttpURLConnection) ? (HttpURLConnection)urlConnection : null;
64	// urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
65	final InputStream urlInputStream=urlConnection.getInputStream();
66	final String contentType=urlConnection.getContentType();
67	if (contentType!=null) {
68	encoding=Source.getCharsetParameterFromHttpHeaderValue(contentType);
69	if (encoding!=null && encoding.length()>0) {
70	boolean encodingSupported=false;
71	try {
72	if (Charset.isSupported(encoding)) encodingSupported=true;
73	} catch (IllegalCharsetNameException ex) {
74	if (encoding.charAt(0)=='"') {
75	String encodingWithoutQuotes=encoding.replace("\"","");
76	if (EncodingDetector.isEncodingSupported(encodingWithoutQuotes)) {
77	logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes, which have been ignored");
78	encodingSupported=true;
79	} else {
80	logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes");
81	}
82	encoding=encodingWithoutQuotes;
83	}
84	}
85	if (encodingSupported) {
86	inputStream=urlInputStream;
87	encodingSpecificationInfo="HTTP header Content-Type: "+contentType;
88	return;
89	}
90	logger.warn("Encoding "+encoding+" specified in HTTP header is not supported, attempting other means of detection");
91	}
92	}
93	inputStream=urlInputStream.markSupported() ? urlInputStream : new BufferedInputStream(urlInputStream);
94	init();
95	}
96
97	public StreamEncodingDetector(final InputStream inputStream) throws IOException {
98	this.inputStream=inputStream.markSupported() ? inputStream : new BufferedInputStream(inputStream);
99	init();
100	}
101
102	public InputStream getInputStream() {
103	return inputStream;
104	}
105
106	public String getEncoding() {
107	return encoding;
108	}
109
110	public String getEncodingSpecificationInfo() {
111	return encodingSpecificationInfo;
112	}
113
114	public boolean isDifinitive() {
115	return definitive;
116	}
117
118	public boolean isEndOfFile() {
119	return endOfFile;
120	}
121
122	public boolean isDocumentSpecifiedEncodingPossible() {
123	return documentSpecifiedEncodingPossible;
124	}
125
126	public LoggerQueue getLoggerQueue() {
127	return logger;
128	}
129
130	public Reader openReader() throws UnsupportedEncodingException {
131	if (encoding==null) return new InputStreamReader(inputStream,DEFAULT_8BIT); // encoding==null only if input stream is empty so use an arbitrary encoding.
132	if (!EncodingDetector.isEncodingSupported(encoding)) throw new UnsupportedEncodingException(encoding+" - "+encodingSpecificationInfo);
133	return new InputStreamReader(inputStream,encoding);
134	}
135
136	private boolean setEncoding(final String encoding, final String encodingSpecificationInfo, int skipChars, boolean endOfFile) throws IOException {
137	this.encoding=encoding;
138	this.encodingSpecificationInfo=encodingSpecificationInfo;
139	this.endOfFile=endOfFile;
140	for (int i=0; i<skipChars; i++) inputStream.read();
141	return true;
142	}
143
144	private boolean init() throws IOException {
145	inputStream.mark(5);
146	final int b1=inputStream.read();
147	if (b1==-1) return setEncoding(null,"empty input stream",0,true);
148	final int b2=inputStream.read();
149	final int b3=inputStream.read();
150	final int b4=inputStream.read();
151	final int b5=inputStream.read();
152	inputStream.reset();
153	// Check for Unicode Byte Order Mark:
154	if (b1==0xEF) {
155	if (b2==0xBB && b3==0xBF) return setEncoding(UTF_8,"UTF-8 Byte Order Mark (EF BB BF)",3,b4==-1);
156	} else if (b1==0xFE) {
157	if (b2==0xFF) return setEncoding(UTF_16,"UTF-16 big-endian Byte Order Mark (FE FF)",2,b3==-1);
158	} else if (b1==0xFF) {
159	if (b2==0xFE) {
160	if (b3==0 && b4==0) return setEncoding(UTF_32,"UTF-32 little-endian Byte Order Mark (FF EE 00 00)",4,b5==-1);
161	return setEncoding(UTF_16,"UTF-16 little-endian Byte Order Mark (FF EE)",2,b3==-1);
162	}
163	} else if (b1==0) {
164	if (b2==0 && b3==0xFE && b4==0xFF) return setEncoding(UTF_32,"UTF-32 big-endian Byte Order Mark (00 00 FE FF)",4,b5==-1);
165	} else if (b1==0x0E) {
166	if (b2==0xFE && b3==0xFF) return setEncoding(SCSU,"SCSU Byte Order Mark (0E FE FF)",3,b4==-1);
167	} else if (b1==0x2B) {
168	if (b2==0x2F && b3==0x76) return setEncoding(UTF_7,"UTF-7 Byte Order Mark (2B 2F 76)",3,b4==-1);
169	} else if (b1==0xDD) {
170	if (b2==0x73 && b3==0x66 && b4==0x73) return setEncoding(UTF_EBCDIC,"UTF-EBCDIC Byte Order Mark (DD 73 66 73)",4,b5==-1);
171	} else if (b1==0xFB) {
172	if (b2==0xEE && b3==0x28) return setEncoding(BOCU_1,"BOCU-1 Byte Order Mark (FB EE 28)",3,b4==-1);
173	}
174	// No Unicode Byte Order Mark found. Have to start guessing.
175	definitive=false;
176	// The best we can do is to provide an encoding that reflects the correct number and ordering of bytes for characters in the ASCII range.
177	// The result will be one of DEFAULT_8BIT, EBCDIC, UTF_16BE, UTF_16LE, UTF_32BE or UTF_32LE.
178	// Assumes 00 bytes indicate multi-byte encodings rather than the presence of NUL characters or characters with a code that is a multiple of 0x100.
179	if (b4==-1) {
180	// The stream contains between 1 and 3 bytes.
181	// This means the document can't possibly specify the encoding, so make a best guess based on the first 3 bytes.
182	documentSpecifiedEncodingPossible=false;
183	// It might be possible to rule out some encodings based on these bytes, but it is impossible to make a definite determination.
184	// The main thing to determine is whether it is an 8-bit or 16-bit encoding.
185	// In order to guess the most likely encoding, assume that the text contains only ASCII characters, and that any 00 bytes indicate a 16-bit encoding.
186	// The only strictly 8-bit encoding guaranteed to be supported on all java platforms is ISO-8859-1 (UTF-8 uses a variable number of bytes per character).
187	// If no 00 bytes are present it is safest to assume WINDOWS_1252, as this accepts the full range of values 00-FF in every byte.
188	if (b2==-1 \|\| b3!=-1) return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (stream 3 bytes long)",0,false); // The stream contains exactly 1 or 3 bytes, so assume an 8-bit encoding regardless of whether any 00 bytes are present.
189	// The stream contains exactly 2 bytes.
190	if (b1==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with 00, stream 2 bytes long)",0,false);
191	if (b2==0) return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream pattern XX 00, stream 2 bytes long)",0,false);
192	// No 00 bytes present, assume 8-bit encoding:
193	return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present, stream 2 bytes long)",0,false);
194	}
195	// Stream contains at least 4 bytes.
196	// The patterns used for documentation are made up of:
197	// 0 - zero byte
198	// X - non-zero byte
199	// ? - byte value not yet determined
200	if (b1==0) {
201	// pattern 0???
202	if (b2==0) return setEncoding(UTF_32BE,"default 32-bit BE encoding (byte stream starts with 00 00)",0,false); // pattern 00?? most likely indicates UTF-32BE
203	// pattern 0X??
204	// Regardless of the final two bytes, assume that the first two bytes indicate a 16-bit BE encoding.
205	// There are many circumstances where this could be an incorrect assumption, for example:
206	// - UTF-16LE encoding with first character U+0100 (or any other character whose code is a multiple of 100Hex)
207	// - any encoding with first character NUL
208	// - UTF-32BE encoding with first character outside of Basic Multilingual Plane (BMP)
209	// Checking the final two bytes might give some clues as to whether any of these other situations are more likely,
210	// but none of the clues will yield less than a 50% chance that the encoding is in fact UTF-16BE as suggested by the first two bytes.
211	return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with 00)",0,false); // >=50% chance that encoding is UTF-16BE
212	}
213	// pattern X???
214	if (b4==0) {
215	// pattern X??0
216	if (b3==0) return setEncoding(UTF_32LE,"default 32-bit LE encoding (byte stream starts with pattern XX ?? 00 00)",0,false); // pattern X?00 most likely indicates UTF-32LE
217	// pattern X?X0
218	return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream stars with pattern XX ?? XX 00)",0,false); // Regardless of the second byte, assume the fourth 00 byte indicates UTF-16LE.
219	}
220	// pattern X??X
221	if (b2==0) {
222	// pattern X0?X
223	// Assuming the second 00 byte doesn't indicate a NUL character, and that it is very unlikely that this is a 32-bit encoding
224	// of a character outside of the BMP, we can assume that it indicates a 16-bit encoding.
225	// If the pattern is X00X, there is a 50/50 chance that the encoding is BE or LE, with one of the characters have a code that is a multiple of 0x100.
226	// This should be a very rare occurrence, and there is no more than a 50% chance that the encoding
227	// will be different to that assumed (UTF-16LE) without checking for this occurrence, so don't bother checking for it.
228	// If the pattern is X0XX, this is likely to indicate a 16-bit LE encoding with the second character > U+00FF.
229	return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream starts with pattern XX 00 ?? XX)",0,false);
230	}
231	// pattern XX?X
232	if (b3==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with pattern XX XX 00 XX)",0,false); // pattern XX0X likely to indicate a 16-bit BE encoding with the first character > U+00FF.
233	// pattern XXXX
234	// Although it is still possible that this is a 16-bit encoding with the first two characters > U+00FF
235	// Assume the more likely case of four 8-bit characters <= U+00FF.
236	// Check whether it fits some common EBCDIC strings that might be found at the start of a document:
237	if (b1==0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
238	if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)",0,false); // first four bytes are "<?xm" in EBCDIC ("Lo§" in Windows-1252)
239	if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)",0,false); // first four bytes are "<!DO" in EBCDIC ("LZÄÖ" in Windows-1252)
240	if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)",0,false); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" ("LÈãÔ" in Windows-1252), or "<htm" ("L£" in Windows-1252)
241	// although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
242	}
243	// Now confident that it is not EBCDIC, but some other 8-bit encoding.
244	// Most other 8-bit encodings are compatible with ASCII.
245	// Since a document specified encoding requires only ASCII characters, just choose an arbitrary 8-bit preliminary encoding.
246	// UTF-8 is however not a good choice as it is not strictly an 8-bit encoding.
247	// UTF-8 bytes with a value >= 0x80 indicate the presence of a multi-byte character, and there are many byte values that are illegal.
248	// Therefore, choose the only true 8-bit encoding that accepts all byte values and is guaranteed to be available on all java implementations.
249	return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present in first four bytes of stream)",0,false);
250	}
251	}