1 |
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
|
2 |
// Version 3.2
|
3 |
// Copyright (C) 2004-2009 Martin Jericho
|
4 |
// http://jericho.htmlparser.net/
|
5 |
//
|
6 |
// This library is free software; you can redistribute it and/or
|
7 |
// modify it under the terms of either one of the following licences:
|
8 |
//
|
9 |
// 1. The Eclipse Public License (EPL) version 1.0,
|
10 |
// included in this distribution in the file licence-epl-1.0.html
|
11 |
// or available at http://www.eclipse.org/legal/epl-v10.html
|
12 |
//
|
13 |
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
|
14 |
// included in this distribution in the file licence-lgpl-2.1.txt
|
15 |
// or available at http://www.gnu.org/licenses/lgpl.txt
|
16 |
//
|
17 |
// This library is distributed on an "AS IS" basis,
|
18 |
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 |
// See the individual licence texts for more details.
|
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
import java.io.*;
|
25 |
import java.nio.charset.*;
|
26 |
import java.net.*;
|
27 |
|
28 |
/**
|
29 |
* Based on information in:
|
30 |
* http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
|
31 |
* http://www.w3.org/TR/html401/charset.html#h-5.2
|
32 |
*/
|
33 |
final class StreamEncodingDetector {
|
34 |
private final InputStream inputStream;
|
35 |
private String encoding=null;
|
36 |
private String encodingSpecificationInfo=null;
|
37 |
private boolean definitive=true;
|
38 |
private boolean documentSpecifiedEncodingPossible=true;
|
39 |
private boolean endOfFile;
|
40 |
private final LoggerQueue logger=new LoggerQueue();
|
41 |
|
42 |
private static final String UTF_16="UTF-16";
|
43 |
private static final String UTF_16BE="UTF-16BE";
|
44 |
private static final String UTF_16LE="UTF-16LE";
|
45 |
private static final String UTF_8="UTF-8";
|
46 |
private static final String ISO_8859_1="ISO-8859-1";
|
47 |
private static final String EBCDIC="Cp037"; // aka IBM037, not guaranteed, but available on most platforms
|
48 |
private static final String WINDOWS_1252="Cp1252"; // aka Windows-1252, not guaranteed
|
49 |
private static final String DEFAULT_8BIT=EncodingDetector.isEncodingSupported(WINDOWS_1252)?WINDOWS_1252:ISO_8859_1;
|
50 |
|
51 |
// All of the following encodings are generally not supported in java and will usually throw an exception if decoding is attempted.
|
52 |
// Specified explicitly using Byte Order Mark:
|
53 |
private static final String SCSU="SCSU";
|
54 |
private static final String UTF_7="UTF-7";
|
55 |
private static final String UTF_EBCDIC="UTF-EBCDIC";
|
56 |
private static final String BOCU_1="BOCU-1";
|
57 |
private static final String UTF_32="UTF-32";
|
58 |
// Guessed from presence of 00 bytes in first four bytes:
|
59 |
private static final String UTF_32BE="UTF-32BE";
|
60 |
private static final String UTF_32LE="UTF-32LE";
|
61 |
|
62 |
public StreamEncodingDetector(final URLConnection urlConnection) throws IOException {
|
63 |
final HttpURLConnection httpURLConnection=(urlConnection instanceof HttpURLConnection) ? (HttpURLConnection)urlConnection : null;
|
64 |
// urlConnection.setRequestProperty("Accept-Charset","UTF-8, ISO-8859-1;q=0"); // used for debugging
|
65 |
final InputStream urlInputStream=urlConnection.getInputStream();
|
66 |
final String contentType=urlConnection.getContentType();
|
67 |
if (contentType!=null) {
|
68 |
encoding=Source.getCharsetParameterFromHttpHeaderValue(contentType);
|
69 |
if (encoding!=null && encoding.length()>0) {
|
70 |
boolean encodingSupported=false;
|
71 |
try {
|
72 |
if (Charset.isSupported(encoding)) encodingSupported=true;
|
73 |
} catch (IllegalCharsetNameException ex) {
|
74 |
if (encoding.charAt(0)=='"') {
|
75 |
String encodingWithoutQuotes=encoding.replace("\"","");
|
76 |
if (EncodingDetector.isEncodingSupported(encodingWithoutQuotes)) {
|
77 |
logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes, which have been ignored");
|
78 |
encodingSupported=true;
|
79 |
} else {
|
80 |
logger.warn("Encoding "+encoding+" specified in HTTP header is illegaly delimited with double quotes");
|
81 |
}
|
82 |
encoding=encodingWithoutQuotes;
|
83 |
}
|
84 |
}
|
85 |
if (encodingSupported) {
|
86 |
inputStream=urlInputStream;
|
87 |
encodingSpecificationInfo="HTTP header Content-Type: "+contentType;
|
88 |
return;
|
89 |
}
|
90 |
logger.warn("Encoding "+encoding+" specified in HTTP header is not supported, attempting other means of detection");
|
91 |
}
|
92 |
}
|
93 |
inputStream=urlInputStream.markSupported() ? urlInputStream : new BufferedInputStream(urlInputStream);
|
94 |
init();
|
95 |
}
|
96 |
|
97 |
public StreamEncodingDetector(final InputStream inputStream) throws IOException {
|
98 |
this.inputStream=inputStream.markSupported() ? inputStream : new BufferedInputStream(inputStream);
|
99 |
init();
|
100 |
}
|
101 |
|
102 |
public InputStream getInputStream() {
|
103 |
return inputStream;
|
104 |
}
|
105 |
|
106 |
public String getEncoding() {
|
107 |
return encoding;
|
108 |
}
|
109 |
|
110 |
public String getEncodingSpecificationInfo() {
|
111 |
return encodingSpecificationInfo;
|
112 |
}
|
113 |
|
114 |
public boolean isDifinitive() {
|
115 |
return definitive;
|
116 |
}
|
117 |
|
118 |
public boolean isEndOfFile() {
|
119 |
return endOfFile;
|
120 |
}
|
121 |
|
122 |
public boolean isDocumentSpecifiedEncodingPossible() {
|
123 |
return documentSpecifiedEncodingPossible;
|
124 |
}
|
125 |
|
126 |
public LoggerQueue getLoggerQueue() {
|
127 |
return logger;
|
128 |
}
|
129 |
|
130 |
public Reader openReader() throws UnsupportedEncodingException {
|
131 |
if (encoding==null) return new InputStreamReader(inputStream,DEFAULT_8BIT); // encoding==null only if input stream is empty so use an arbitrary encoding.
|
132 |
if (!EncodingDetector.isEncodingSupported(encoding)) throw new UnsupportedEncodingException(encoding+" - "+encodingSpecificationInfo);
|
133 |
return new InputStreamReader(inputStream,encoding);
|
134 |
}
|
135 |
|
136 |
private boolean setEncoding(final String encoding, final String encodingSpecificationInfo, int skipChars, boolean endOfFile) throws IOException {
|
137 |
this.encoding=encoding;
|
138 |
this.encodingSpecificationInfo=encodingSpecificationInfo;
|
139 |
this.endOfFile=endOfFile;
|
140 |
for (int i=0; i<skipChars; i++) inputStream.read();
|
141 |
return true;
|
142 |
}
|
143 |
|
144 |
private boolean init() throws IOException {
|
145 |
inputStream.mark(5);
|
146 |
final int b1=inputStream.read();
|
147 |
if (b1==-1) return setEncoding(null,"empty input stream",0,true);
|
148 |
final int b2=inputStream.read();
|
149 |
final int b3=inputStream.read();
|
150 |
final int b4=inputStream.read();
|
151 |
final int b5=inputStream.read();
|
152 |
inputStream.reset();
|
153 |
// Check for Unicode Byte Order Mark:
|
154 |
if (b1==0xEF) {
|
155 |
if (b2==0xBB && b3==0xBF) return setEncoding(UTF_8,"UTF-8 Byte Order Mark (EF BB BF)",3,b4==-1);
|
156 |
} else if (b1==0xFE) {
|
157 |
if (b2==0xFF) return setEncoding(UTF_16,"UTF-16 big-endian Byte Order Mark (FE FF)",2,b3==-1);
|
158 |
} else if (b1==0xFF) {
|
159 |
if (b2==0xFE) {
|
160 |
if (b3==0 && b4==0) return setEncoding(UTF_32,"UTF-32 little-endian Byte Order Mark (FF EE 00 00)",4,b5==-1);
|
161 |
return setEncoding(UTF_16,"UTF-16 little-endian Byte Order Mark (FF EE)",2,b3==-1);
|
162 |
}
|
163 |
} else if (b1==0) {
|
164 |
if (b2==0 && b3==0xFE && b4==0xFF) return setEncoding(UTF_32,"UTF-32 big-endian Byte Order Mark (00 00 FE FF)",4,b5==-1);
|
165 |
} else if (b1==0x0E) {
|
166 |
if (b2==0xFE && b3==0xFF) return setEncoding(SCSU,"SCSU Byte Order Mark (0E FE FF)",3,b4==-1);
|
167 |
} else if (b1==0x2B) {
|
168 |
if (b2==0x2F && b3==0x76) return setEncoding(UTF_7,"UTF-7 Byte Order Mark (2B 2F 76)",3,b4==-1);
|
169 |
} else if (b1==0xDD) {
|
170 |
if (b2==0x73 && b3==0x66 && b4==0x73) return setEncoding(UTF_EBCDIC,"UTF-EBCDIC Byte Order Mark (DD 73 66 73)",4,b5==-1);
|
171 |
} else if (b1==0xFB) {
|
172 |
if (b2==0xEE && b3==0x28) return setEncoding(BOCU_1,"BOCU-1 Byte Order Mark (FB EE 28)",3,b4==-1);
|
173 |
}
|
174 |
// No Unicode Byte Order Mark found. Have to start guessing.
|
175 |
definitive=false;
|
176 |
// The best we can do is to provide an encoding that reflects the correct number and ordering of bytes for characters in the ASCII range.
|
177 |
// The result will be one of DEFAULT_8BIT, EBCDIC, UTF_16BE, UTF_16LE, UTF_32BE or UTF_32LE.
|
178 |
// Assumes 00 bytes indicate multi-byte encodings rather than the presence of NUL characters or characters with a code that is a multiple of 0x100.
|
179 |
if (b4==-1) {
|
180 |
// The stream contains between 1 and 3 bytes.
|
181 |
// This means the document can't possibly specify the encoding, so make a best guess based on the first 3 bytes.
|
182 |
documentSpecifiedEncodingPossible=false;
|
183 |
// It might be possible to rule out some encodings based on these bytes, but it is impossible to make a definite determination.
|
184 |
// The main thing to determine is whether it is an 8-bit or 16-bit encoding.
|
185 |
// In order to guess the most likely encoding, assume that the text contains only ASCII characters, and that any 00 bytes indicate a 16-bit encoding.
|
186 |
// The only strictly 8-bit encoding guaranteed to be supported on all java platforms is ISO-8859-1 (UTF-8 uses a variable number of bytes per character).
|
187 |
// If no 00 bytes are present it is safest to assume WINDOWS_1252, as this accepts the full range of values 00-FF in every byte.
|
188 |
if (b2==-1 || b3!=-1) return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (stream 3 bytes long)",0,false); // The stream contains exactly 1 or 3 bytes, so assume an 8-bit encoding regardless of whether any 00 bytes are present.
|
189 |
// The stream contains exactly 2 bytes.
|
190 |
if (b1==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with 00, stream 2 bytes long)",0,false);
|
191 |
if (b2==0) return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream pattern XX 00, stream 2 bytes long)",0,false);
|
192 |
// No 00 bytes present, assume 8-bit encoding:
|
193 |
return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present, stream 2 bytes long)",0,false);
|
194 |
}
|
195 |
// Stream contains at least 4 bytes.
|
196 |
// The patterns used for documentation are made up of:
|
197 |
// 0 - zero byte
|
198 |
// X - non-zero byte
|
199 |
// ? - byte value not yet determined
|
200 |
if (b1==0) {
|
201 |
// pattern 0???
|
202 |
if (b2==0) return setEncoding(UTF_32BE,"default 32-bit BE encoding (byte stream starts with 00 00)",0,false); // pattern 00?? most likely indicates UTF-32BE
|
203 |
// pattern 0X??
|
204 |
// Regardless of the final two bytes, assume that the first two bytes indicate a 16-bit BE encoding.
|
205 |
// There are many circumstances where this could be an incorrect assumption, for example:
|
206 |
// - UTF-16LE encoding with first character U+0100 (or any other character whose code is a multiple of 100Hex)
|
207 |
// - any encoding with first character NUL
|
208 |
// - UTF-32BE encoding with first character outside of Basic Multilingual Plane (BMP)
|
209 |
// Checking the final two bytes might give some clues as to whether any of these other situations are more likely,
|
210 |
// but none of the clues will yield less than a 50% chance that the encoding is in fact UTF-16BE as suggested by the first two bytes.
|
211 |
return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with 00)",0,false); // >=50% chance that encoding is UTF-16BE
|
212 |
}
|
213 |
// pattern X???
|
214 |
if (b4==0) {
|
215 |
// pattern X??0
|
216 |
if (b3==0) return setEncoding(UTF_32LE,"default 32-bit LE encoding (byte stream starts with pattern XX ?? 00 00)",0,false); // pattern X?00 most likely indicates UTF-32LE
|
217 |
// pattern X?X0
|
218 |
return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream stars with pattern XX ?? XX 00)",0,false); // Regardless of the second byte, assume the fourth 00 byte indicates UTF-16LE.
|
219 |
}
|
220 |
// pattern X??X
|
221 |
if (b2==0) {
|
222 |
// pattern X0?X
|
223 |
// Assuming the second 00 byte doesn't indicate a NUL character, and that it is very unlikely that this is a 32-bit encoding
|
224 |
// of a character outside of the BMP, we can assume that it indicates a 16-bit encoding.
|
225 |
// If the pattern is X00X, there is a 50/50 chance that the encoding is BE or LE, with one of the characters have a code that is a multiple of 0x100.
|
226 |
// This should be a very rare occurrence, and there is no more than a 50% chance that the encoding
|
227 |
// will be different to that assumed (UTF-16LE) without checking for this occurrence, so don't bother checking for it.
|
228 |
// If the pattern is X0XX, this is likely to indicate a 16-bit LE encoding with the second character > U+00FF.
|
229 |
return setEncoding(UTF_16LE,"default 16-bit LE encoding (byte stream starts with pattern XX 00 ?? XX)",0,false);
|
230 |
}
|
231 |
// pattern XX?X
|
232 |
if (b3==0) return setEncoding(UTF_16BE,"default 16-bit BE encoding (byte stream starts with pattern XX XX 00 XX)",0,false); // pattern XX0X likely to indicate a 16-bit BE encoding with the first character > U+00FF.
|
233 |
// pattern XXXX
|
234 |
// Although it is still possible that this is a 16-bit encoding with the first two characters > U+00FF
|
235 |
// Assume the more likely case of four 8-bit characters <= U+00FF.
|
236 |
// Check whether it fits some common EBCDIC strings that might be found at the start of a document:
|
237 |
if (b1==0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
|
238 |
if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)",0,false); // first four bytes are "<?xm" in EBCDIC ("Lo§" in Windows-1252)
|
239 |
if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)",0,false); // first four bytes are "<!DO" in EBCDIC ("LZÄÖ" in Windows-1252)
|
240 |
if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)",0,false); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" ("LÈãÔ" in Windows-1252), or "<htm" ("L£" in Windows-1252)
|
241 |
// although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
|
242 |
}
|
243 |
// Now confident that it is not EBCDIC, but some other 8-bit encoding.
|
244 |
// Most other 8-bit encodings are compatible with ASCII.
|
245 |
// Since a document specified encoding requires only ASCII characters, just choose an arbitrary 8-bit preliminary encoding.
|
246 |
// UTF-8 is however not a good choice as it is not strictly an 8-bit encoding.
|
247 |
// UTF-8 bytes with a value >= 0x80 indicate the presence of a multi-byte character, and there are many byte values that are illegal.
|
248 |
// Therefore, choose the only true 8-bit encoding that accepts all byte values and is guaranteed to be available on all java implementations.
|
249 |
return setEncoding(DEFAULT_8BIT,"default 8-bit ASCII-compatible encoding (no 00 bytes present in first four bytes of stream)",0,false);
|
250 |
}
|
251 |
}
|