1 |
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
|
2 |
// Version 3.2
|
3 |
// Copyright (C) 2004-2009 Martin Jericho
|
4 |
// http://jericho.htmlparser.net/
|
5 |
//
|
6 |
// This library is free software; you can redistribute it and/or
|
7 |
// modify it under the terms of either one of the following licences:
|
8 |
//
|
9 |
// 1. The Eclipse Public License (EPL) version 1.0,
|
10 |
// included in this distribution in the file licence-epl-1.0.html
|
11 |
// or available at http://www.eclipse.org/legal/epl-v10.html
|
12 |
//
|
13 |
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
|
14 |
// included in this distribution in the file licence-lgpl-2.1.txt
|
15 |
// or available at http://www.gnu.org/licenses/lgpl.txt
|
16 |
//
|
17 |
// This library is distributed on an "AS IS" basis,
|
18 |
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 |
// See the individual licence texts for more details.
|
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
|
25 |
/**
|
26 |
* Encapsulates global configuration properties which determine the behaviour of various functions.
|
27 |
* <p>
|
28 |
* All of the properties in this class are static, affecting all objects and threads.
|
29 |
* Multiple concurrent configurations are not possible.
|
30 |
* <p>
|
31 |
* Properties that relate to <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>
|
32 |
* compatibility issues are stored in instances of the {@link Config.CompatibilityMode} class.
|
33 |
* This allows all of the properties in the compatibility mode to be set as a block by setting the static
|
34 |
* {@link #CurrentCompatibilityMode} property to a different instance.
|
35 |
*
|
36 |
* @see Config.CompatibilityMode
|
37 |
*/
|
38 |
public final class Config {
|
39 |
private Config() {} // private and empty: the class cannot be instantiated from outside
|
40 |
|
41 |
/**
|
42 |
* Determines the string used to separate a single column's multiple values in the output of the {@link FormFields#getColumnValues(Map)} method.
|
43 |
* <p>
|
44 |
* The situation where a single column has multiple values only arises if {@link FormField#getUserValueCount()}<code>>1</code>
|
45 |
* on the relevant form field, which usually indicates a poorly designed form.
|
46 |
* <p>
|
47 |
* The default value is "<code>,</code>" (a comma, not including the quotes).
|
48 |
* <p>
|
49 |
* Must not be <code>null</code>.
|
50 |
*/
|
51 |
public static String ColumnMultipleValueSeparator=","; // mutable global; initial value is a single comma
|
52 |
|
53 |
/**
|
54 |
* Determines the string that represents the value <code>true</code> in the output of the {@link FormFields#getColumnValues(Map)} method.
|
55 |
* <p>
|
56 |
* The default value is "<code>true</code>" (without the quotes).
|
57 |
* <p>
|
58 |
* Must not be <code>null</code>.
|
59 |
*/
|
60 |
public static String ColumnValueTrue=Boolean.toString(true); // mutable global; initial value is the string "true"
|
61 |
|
62 |
/**
|
63 |
* Determines the string that represents the value <code>false</code> in the output of the {@link FormFields#getColumnValues(Map)} method.
|
64 |
* <p>
|
65 |
* The default value is <code>null</code>, which represents no output at all.
|
66 |
*/
|
67 |
public static String ColumnValueFalse=null; // mutable global; null is the documented "no output" marker
|
68 |
|
69 |
/**
|
70 |
* Determines whether the {@link CharacterReference#decode(CharSequence)} and similar methods convert non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;}) character references to normal spaces.
|
71 |
* <p>
|
72 |
* The default value is <code>true</code>.
|
73 |
* <p>
|
74 |
* When this property is set to <code>false</code>, non-breaking space ({@link CharacterEntityReference#_nbsp &nbsp;})
|
75 |
* character references are decoded as non-breaking space characters (U+00A0) instead of being converted to normal spaces (U+0020).
|
76 |
* <p>
|
77 |
* The default behaviour of the library reflects the fact that non-breaking space character references are almost always used in HTML documents
|
78 |
* as a <a target="_blank" href="http://en.wikipedia.org/wiki/Non-breaking_space#Use_as_non-collapsing_whitespace">non-collapsing white space</a> character.
|
79 |
* Converting them to the correct character code U+00A0, which is represented by a visible character in many older character sets, was confusing to most users
|
80 |
* who expected to see only normal spaces.
|
81 |
* The most common example of this is its visualisation as the character <b>á</b> in the MS-DOS <a target="_blank" href="http://en.wikipedia.org/wiki/Code_page_437">CP437</a> character set.
|
82 |
* <p>
|
83 |
* The functionality of the following methods is affected:
|
84 |
* <ul>
|
85 |
* <li>{@link CharacterReference#appendCharTo(Appendable)}
|
86 |
* <li>{@link CharacterReference#decode(CharSequence)}
|
87 |
* <li>{@link CharacterReference#decode(CharSequence, boolean insideAttributeValue)}
|
88 |
* <li>{@link CharacterReference#decodeCollapseWhiteSpace(CharSequence)}
|
89 |
* <li>{@link CharacterReference#reencode(CharSequence)}
|
90 |
* <li>{@link Attribute#getValue()}
|
91 |
* <li>{@link Attributes#getValue(String name)}
|
92 |
* <li>{@link Attributes#populateMap(Map, boolean convertNamesToLowerCase)}
|
93 |
* <li>{@link StartTag#getAttributeValue(String attributeName)}
|
94 |
* <li>{@link Element#getAttributeValue(String attributeName)}
|
95 |
* <li>{@link FormControl#getPredefinedValues()}
|
96 |
* <li>{@link OutputDocument#replace(Attributes, boolean convertNamesToLowerCase)}
|
97 |
* <li>{@link Renderer#getConvertNonBreakingSpaces()}
|
98 |
* <li>{@link TextExtractor#getConvertNonBreakingSpaces()}
|
99 |
* </ul>
|
100 |
*/
|
101 |
public static boolean ConvertNonBreakingSpaces=true; // mutable global; conversion to normal spaces is on by default
|
102 |
|
103 |
|
104 |
/**
|
105 |
* Determines the currently active {@linkplain Config.CompatibilityMode compatibility mode}.
|
106 |
* <p>
|
107 |
* The default setting is {@link Config.CompatibilityMode#IE} (MS Internet Explorer 6.0).
|
108 |
* <p>
|
109 |
* Must not be <code>null</code>.
|
110 |
*/
|
111 |
public static CompatibilityMode CurrentCompatibilityMode=CompatibilityMode.IE; // mutable global; starts as the predefined IE instance
|
112 |
|
113 |
/**
|
114 |
* Determines whether apostrophes are encoded when calling the {@link CharacterReference#encode(CharSequence)} method.
|
115 |
* <p>
|
116 |
* A value of <code>false</code> means {@linkplain CharacterEntityReference#_apos apostrophe}
|
117 |
* (U+0027) characters are not encoded.
|
118 |
* The only time apostrophes need to be encoded is within an attribute value delimited by
|
119 |
* single quotes (apostrophes), so in most cases ignoring apostrophes is perfectly safe and
|
120 |
* enhances the readability of the source document.
|
121 |
* <p>
|
122 |
* Note that apostrophes are always encoded as a {@linkplain NumericCharacterReference numeric character reference}, never as the
|
123 |
* character entity reference {@link CharacterEntityReference#_apos &apos;}.
|
124 |
* <p>
|
125 |
* The default value is <code>false</code>.
|
126 |
*/
|
127 |
public static boolean IsApostropheEncoded=false; // mutable global; apostrophe encoding is off by default
|
128 |
|
129 |
/**
|
130 |
* Determines whether all {@linkplain StartTag#isEmptyElementTag() empty-element tags} are recognised.
|
131 |
* <p>
|
132 |
* The major browsers do not recognise empty-element tags (those having the characters "/>" at the end of the start tag) if the element is defined by the
|
133 |
* HTML specification to have a {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or an {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag.
|
134 |
* This is the case even in <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> documents, which can cause a lot of confusion.
|
135 |
* <p>
|
136 |
* Setting this property to <code>true</code> forces the parser to recognise all {@linkplain StartTag#isSyntacticalEmptyElementTag() syntactical empty-element tags},
|
137 |
* regardless of whether the element is defined by the HTML specification to have a required or optional end tag.
|
138 |
* <p>
|
139 |
* Use of this feature is however not recommended as it makes the parser behaviour inconsistent with that of most browsers.
|
140 |
* <p>
|
141 |
* The default value is <code>false</code>.
|
142 |
*
|
143 |
* @see StartTag#isEmptyElementTag()
|
144 |
*/
|
145 |
public static boolean IsHTMLEmptyElementTagRecognised=false; // mutable global; off by default
|
146 |
|
147 |
/**
|
148 |
* Determines the {@link LoggerProvider} that is used to create the default {@link Logger} object for each new {@link Source} object.
|
149 |
* <p>
|
150 |
* The {@link LoggerProvider} interface contains several predefined <code>LoggerProvider</code> instances which this property can be set to,
|
151 |
* mostly representing wrappers to common logging frameworks.
|
152 |
* <p>
|
153 |
* The default value is <code>null</code>, which results in the auto-detection of the most appropriate logging mechanism according to the following algorithm:
|
154 |
* <p>
|
155 |
* <ol>
|
156 |
* <li>If the class <code>org.slf4j.impl.StaticLoggerBinder</code> is detected:
|
157 |
* <ul>
|
158 |
* <li>If the class <code>org.slf4j.impl.JDK14LoggerFactory</code> is detected, use {@link LoggerProvider#JAVA}.
|
159 |
* <li>If the class <code>org.slf4j.impl.Log4jLoggerFactory</code> is detected, use {@link LoggerProvider#LOG4J}.
|
160 |
* <li>If the class <code>org.slf4j.impl.JCLLoggerFactory</code> is NOT detected, use {@link LoggerProvider#SLF4J}.
|
161 |
* </ul>
|
162 |
* <li>If the class <code>org.apache.commons.logging.Log</code> is detected:
|
163 |
* <blockquote>
|
164 |
* Create an instance of it using the commons-logging <code>LogFactory</code> class.
|
165 |
* <ul>
|
166 |
* <li>If the created <code>Log</code> is of type <code>org.apache.commons.logging.impl.Jdk14Logger</code>, use {@link LoggerProvider#JAVA}.
|
167 |
* <li>If the created <code>Log</code> is of type <code>org.apache.commons.logging.impl.Log4JLogger</code>, use {@link LoggerProvider#LOG4J}.
|
168 |
* <li>otherwise, use {@link LoggerProvider#JCL}.
|
169 |
* </ul>
|
170 |
* </blockquote>
|
171 |
* <li>If the class <code>org.apache.log4j.Logger</code> is detected, use {@link LoggerProvider#LOG4J}.
|
172 |
* <li>otherwise, use {@link LoggerProvider#JAVA}.
|
173 |
* </ol>
|
174 |
*
|
175 |
* @see Source#setLogger(Logger)
|
176 |
*/
|
177 |
public static LoggerProvider LoggerProvider=null; // note: the field has the same name as its type; null is the documented auto-detection setting
|
178 |
|
179 |
/**
|
180 |
* Determines the string used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in text output throughout the library.
|
181 |
* <p>
|
182 |
* The default value is the standard new line character sequence of the host platform, determined by <code>System.getProperty("line.separator")</code>.
|
183 |
*/
|
184 |
public static String NewLine=System.getProperty("line.separator"); // initialiser runs once, when this class is initialised
|
185 |
|
186 |
/**
|
187 |
* Used in Element.getChildElements.
|
188 |
* Server elements containing markup should be included in the hierarchy, so consider making this option public in future.
|
189 |
*/
|
190 |
static final boolean IncludeServerTagsInElementHierarchy=false; // package-private and final: cannot be changed by library users
|
191 |
|
192 |
/**
|
193 |
* Represents a set of maximum unicode code points to be recognised for the three types of
|
194 |
* <a href="CharacterReference.html#Unterminated">unterminated</a> character reference in a given context.
|
195 |
* <p>
|
196 |
* The three types of character reference are:
|
197 |
* <ul>
|
198 |
* <li>{@linkplain CharacterEntityReference Character entity reference}
|
199 |
* <li><a href="NumericCharacterReference.html#DecimalCharacterReference">Decimal character reference</a>
|
200 |
* <li><a href="NumericCharacterReference.html#HexadecimalCharacterReference">Hexadecimal character reference</a>
|
201 |
* </ul>
|
202 |
* <p>
|
203 |
* The two types of contexts used in this library are:
|
204 |
* <ul>
|
205 |
* <li>Inside an attribute value
|
206 |
* <li>Outside an attribute value
|
207 |
* </ul>
|
208 |
*/
|
209 |
static class UnterminatedCharacterReferenceSettings {
|
210 |
// use volatile fields to make them thread safe
|
211 |
public volatile int characterEntityReferenceMaxCodePoint;
|
212 |
public volatile int decimalCharacterReferenceMaxCodePoint;
|
213 |
public volatile int hexadecimalCharacterReferenceMaxCodePoint;
|
214 |
|
215 |
public static UnterminatedCharacterReferenceSettings ACCEPT_ALL=new UnterminatedCharacterReferenceSettings(CompatibilityMode.CODE_POINTS_ALL,CompatibilityMode.CODE_POINTS_ALL,CompatibilityMode.CODE_POINTS_ALL);
|
216 |
|
217 |
public UnterminatedCharacterReferenceSettings() {
|
218 |
this(CompatibilityMode.CODE_POINTS_NONE,CompatibilityMode.CODE_POINTS_NONE,CompatibilityMode.CODE_POINTS_NONE);
|
219 |
}
|
220 |
|
221 |
public UnterminatedCharacterReferenceSettings(final int characterEntityReferenceMaxCodePoint, final int decimalCharacterReferenceMaxCodePoint, final int hexadecimalCharacterReferenceMaxCodePoint) {
|
222 |
this.characterEntityReferenceMaxCodePoint=characterEntityReferenceMaxCodePoint;
|
223 |
this.decimalCharacterReferenceMaxCodePoint=decimalCharacterReferenceMaxCodePoint;
|
224 |
this.hexadecimalCharacterReferenceMaxCodePoint=hexadecimalCharacterReferenceMaxCodePoint;
|
225 |
}
|
226 |
|
227 |
public String toString() {
|
228 |
return Config.NewLine+" Character entity reference: "+getDescription(characterEntityReferenceMaxCodePoint)
|
229 |
+Config.NewLine+" Decimal character reference: "+getDescription(decimalCharacterReferenceMaxCodePoint)
|
230 |
+Config.NewLine+" Haxadecimal character reference: "+getDescription(hexadecimalCharacterReferenceMaxCodePoint);
|
231 |
}
|
232 |
|
233 |
private String getDescription(final int codePoint) {
|
234 |
if (codePoint==CompatibilityMode.CODE_POINTS_NONE) return "None";
|
235 |
if (codePoint==CompatibilityMode.CODE_POINTS_ALL) return "All";
|
236 |
return "0x"+Integer.toString(codePoint,16);
|
237 |
}
|
238 |
}
|
239 |
|
240 |
/**
|
241 |
* Represents a set of configuration parameters that relate to
|
242 |
* <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a> compatibility issues.
|
243 |
* <p>
|
244 |
* The predefined compatibility modes {@link #IE}, {@link #MOZILLA}, {@link #OPERA} and {@link #XHTML} provide an easy means of
|
245 |
* ensuring the library interprets the markup in a way consistent with some of the most commonly used browsers,
|
246 |
* at least in relation to the behaviour described by the properties in this class.
|
247 |
* <p>
|
248 |
* The properties of any <code>CompatibilityMode</code> object can be modified individually, including those in
|
249 |
* the predefined instances as well as newly constructed instances.
|
250 |
* Take note however that modifying the properties of the predefined instances has a global effect.
|
251 |
* <p>
|
252 |
* The currently active compatibility mode is stored in the static {@link Config#CurrentCompatibilityMode} property.
|
253 |
* <p>
|
254 |
*/
|
255 |
public static final class CompatibilityMode {
|
256 |
private String name; // set by the constructor; returned by getName()
|
257 |
private volatile boolean formFieldNameCaseInsensitive; // accessed through isFormFieldNameCaseInsensitive() / setFormFieldNameCaseInsensitive(boolean)
|
258 |
volatile UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue; // package-private; third constructor argument (the "inside attributes" settings)
|
259 |
volatile UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue; // package-private; fourth constructor argument (the "outside attributes" settings)
|
260 |
|
261 |
/**
|
262 |
* Indicates the recognition of all unicode code points.
|
263 |
* <p>
|
264 |
* This value is used in properties which specify a maximum unicode code point to be recognised by the parser.
|
265 |
*
|
266 |
* @see #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)
|
267 |
* @see #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
|
268 |
* @see #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
|
269 |
*/
|
270 |
public static final int CODE_POINTS_ALL=Character.MAX_CODE_POINT; // highest unicode code point: 0x10FFFF (decimal 1114111)
|
271 |
|
272 |
/**
|
273 |
* Indicates the recognition of no unicode code points.
|
274 |
* <p>
|
275 |
* This value is used in properties which specify a maximum unicode code point to be recognised by the parser.
|
276 |
*
|
277 |
* @see #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)
|
278 |
* @see #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
|
279 |
* @see #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)
|
280 |
*/
|
281 |
public static final int CODE_POINTS_NONE=CharacterReference.INVALID_CODE_POINT; // same value as the library's "invalid code point" marker
|
282 |
|
283 |
/**
|
284 |
* <a target="_blank" href="http://www.microsoft.com/windows/ie/">Microsoft Internet Explorer</a> compatibility mode.
|
285 |
* <p>
|
286 |
* <code>{@link #getName() Name} = IE</code><br />
|
287 |
* <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = true</code><br />
|
288 |
* <table cellspacing="0" cellpadding="0">
|
289 |
* <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
|
290 |
* <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+00FF<td align="center">U+00FF
|
291 |
* <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
292 |
* <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_NONE None}
|
293 |
* </table>
|
294 |
*/
|
295 |
public static final CompatibilityMode IE=new CompatibilityMode("IE",true,
|
296 |
new UnterminatedCharacterReferenceSettings(0xFF, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes
|
297 |
new UnterminatedCharacterReferenceSettings(0xFF, CODE_POINTS_ALL, CODE_POINTS_NONE) // outside attributes
|
298 |
);
|
299 |
|
300 |
/**
|
301 |
* <a target="_blank" href="http://www.mozilla.org/products/mozilla1.x/">Mozilla</a> /
|
302 |
* <a target="_blank" href="http://www.mozilla.org/products/firefox/">Firefox</a> /
|
303 |
* <a target="_blank" href="http://browser.netscape.com/">Netscape</a> compatibility mode.
|
304 |
* <p>
|
305 |
* <code>{@link #getName() Name} = Mozilla</code><br />
|
306 |
* <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = false</code><br />
|
307 |
* <table cellspacing="0" cellpadding="0">
|
308 |
* <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
|
309 |
* <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+00FF<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
310 |
* <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
311 |
* <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
312 |
* </table>
|
313 |
*/
|
314 |
public static final CompatibilityMode MOZILLA=new CompatibilityMode("Mozilla",false,
|
315 |
new UnterminatedCharacterReferenceSettings(0xFF, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes
|
316 |
new UnterminatedCharacterReferenceSettings(CODE_POINTS_ALL, CODE_POINTS_ALL, CODE_POINTS_ALL) // outside attributes
|
317 |
);
|
318 |
|
319 |
/**
|
320 |
* Opera compatibility mode.
|
321 |
* <p>
|
322 |
* <code>{@link #getName() Name} = Opera</code><br />
|
323 |
* <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = true</code><br />
|
324 |
* <table cellspacing="0" cellpadding="0">
|
325 |
* <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
|
326 |
* <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">U+003E<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
327 |
* <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
328 |
* <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_ALL All}<td align="center">{@linkplain #CODE_POINTS_ALL All}
|
329 |
* </table>
|
330 |
*/
|
331 |
public static final CompatibilityMode OPERA=new CompatibilityMode("Opera",true,
|
332 |
new UnterminatedCharacterReferenceSettings(0x3E, CODE_POINTS_ALL, CODE_POINTS_ALL), // inside attributes
|
333 |
new UnterminatedCharacterReferenceSettings(CODE_POINTS_ALL, CODE_POINTS_ALL, CODE_POINTS_ALL) // outside attributes
|
334 |
);
|
335 |
|
336 |
/**
|
337 |
* <a target="_blank" href="http://www.w3.org/TR/xhtml1/#xhtml">XHTML</a> compatibility mode.
|
338 |
* <p>
|
339 |
* <code>{@link #getName() Name} = XHTML</code><br />
|
340 |
* <code>{@link #isFormFieldNameCaseInsensitive() FormFieldNameCaseInsensitive} = false</code><br />
|
341 |
* <table cellspacing="0" cellpadding="0">
|
342 |
* <tr><th>Recognition of unterminated character references:<th><th align="center"> (inside attribute) <th align="center"> (outside attribute)
|
343 |
* <tr><td>{@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean) UnterminatedCharacterEntityReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None}
|
344 |
* <tr><td>{@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedDecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None}
|
345 |
* <tr><td>{@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean) UnterminatedHexadecimalCharacterReferenceMaxCodePoint}<td><code> =</code><td align="center">{@linkplain #CODE_POINTS_NONE None}<td align="center">{@linkplain #CODE_POINTS_NONE None}
|
346 |
* </table>
|
347 |
*/
|
348 |
public static final CompatibilityMode XHTML=new CompatibilityMode("XHTML"); // uses the single-argument constructor, i.e. its default property values
|
349 |
|
350 |
/**
|
351 |
* Constructs a new <code>CompatibilityMode</code> with the given {@linkplain #getName() name}.
|
352 |
* <p>
|
353 |
* All properties in the new instance are initially assigned their default values, which are the same as the strict
|
354 |
* rules of the {@link #XHTML} compatibility mode.
|
355 |
*
|
356 |
* @param name the {@linkplain #getName() name} of the new compatibility mode
|
357 |
*/
|
358 |
public CompatibilityMode(final String name) {
    // Delegates with case-sensitive form field names and two separate
    // default settings objects, one for each attribute-value context.
    this(
        name,
        false,
        new UnterminatedCharacterReferenceSettings(), // inside attribute values
        new UnterminatedCharacterReferenceSettings()  // outside attribute values
    );
}
|
361 |
|
362 |
private CompatibilityMode(
    final String name,
    final boolean formFieldNameCaseInsensitive,
    final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsInsideAttributeValue,
    final UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettingsOutsideAttributeValue
) {
    // Plain field-by-field initialisation; the settings objects are stored as given, not copied.
    this.name=name;
    this.formFieldNameCaseInsensitive=formFieldNameCaseInsensitive;
    this.unterminatedCharacterReferenceSettingsInsideAttributeValue=unterminatedCharacterReferenceSettingsInsideAttributeValue;
    this.unterminatedCharacterReferenceSettingsOutsideAttributeValue=unterminatedCharacterReferenceSettingsOutsideAttributeValue;
}
|
368 |
|
369 |
/**
|
370 |
* Returns the name of this compatibility mode.
|
371 |
* @return the name of this compatibility mode.
|
372 |
*/
|
373 |
public String getName() {
|
374 |
return name;
|
375 |
}
|
376 |
|
377 |
/**
|
378 |
* Indicates whether {@linkplain FormField#getName() form field names} are treated as case insensitive.
|
379 |
* <p>
|
380 |
* Microsoft Internet Explorer treats field names as case insensitive,
|
381 |
* while Mozilla treats them as case sensitive.
|
382 |
* <p>
|
383 |
* The value of this property in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}
|
384 |
* affects all instances of the {@link FormFields} class.
|
385 |
* It should be set to the desired configuration before any instances of <code>FormFields</code> are created.
|
386 |
*
|
387 |
* @return <code>true</code> if {@linkplain FormField#getName() form field names} are treated as case insensitive, otherwise <code>false</code>.
|
388 |
* @see #setFormFieldNameCaseInsensitive(boolean)
|
389 |
*/
|
390 |
public boolean isFormFieldNameCaseInsensitive() {
|
391 |
return formFieldNameCaseInsensitive;
|
392 |
}
|
393 |
|
394 |
/**
|
395 |
* Sets whether {@linkplain FormField#getName() form field names} are treated as case insensitive.
|
396 |
* <p>
|
397 |
* See {@link #isFormFieldNameCaseInsensitive()} for the documentation of this property.
|
398 |
*
|
399 |
* @param value the new value of the property
|
400 |
*/
|
401 |
public void setFormFieldNameCaseInsensitive(final boolean value) {
|
402 |
formFieldNameCaseInsensitive=value;
|
403 |
}
|
404 |
|
405 |
/**
|
406 |
* Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
|
407 |
* {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context.
|
408 |
* <p>
|
409 |
* For example, if <code>getUnterminatedCharacterEntityReferenceMaxCodePoint(true)</code> has the value <code>0xFF</code> (U+00FF)
|
410 |
* in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then:
|
411 |
* <ul>
|
412 |
* <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&gt",true)}
|
413 |
* returns "<code>></code>".<br />
|
414 |
* The string is recognised as the character entity reference {@link CharacterEntityReference#_gt &gt;}
|
415 |
* despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>,
|
416 |
* because its unicode code point U+003E is below the maximum of U+00FF set by this property.
|
417 |
* <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&euro",true)}
|
418 |
* returns "<code>&euro</code>".<br />
|
419 |
* The string is not recognised as the character entity reference {@link CharacterEntityReference#_euro &euro;}
|
420 |
* because it is <a href="CharacterReference.html#Unterminated">unterminated</a>
|
421 |
* and its unicode code point U+20AC is above the maximum of U+00FF set by this property.
|
422 |
* </ul>
|
423 |
* <p>
|
424 |
* See the documentation of the {@link Attribute#getValue()} method for further discussion.
|
425 |
*
|
426 |
* @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
|
427 |
* @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context.
|
428 |
* @see #setUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint)
|
429 |
*/
|
430 |
public int getUnterminatedCharacterEntityReferenceMaxCodePoint(final boolean insideAttributeValue) {
|
431 |
return getUnterminatedCharacterReferenceSettings(insideAttributeValue).characterEntityReferenceMaxCodePoint;
|
432 |
}
|
433 |
|
434 |
/**
|
435 |
* Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
|
436 |
* {@linkplain CharacterEntityReference character entity reference} which is to be recognised in the specified context.
|
437 |
* <p>
|
438 |
* See {@link #getUnterminatedCharacterEntityReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property.
|
439 |
*
|
440 |
* @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
|
441 |
* @param maxCodePoint the maximum unicode code point.
|
442 |
*/
|
443 |
public void setUnterminatedCharacterEntityReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint) {
|
444 |
getUnterminatedCharacterReferenceSettings(insideAttributeValue).characterEntityReferenceMaxCodePoint=maxCodePoint;
|
445 |
}
|
446 |
|
447 |
/**
|
448 |
* Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
|
449 |
* <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context.
|
450 |
* <p>
|
451 |
* For example, if <code>getUnterminatedDecimalCharacterReferenceMaxCodePoint(true)</code> had the hypothetical value <code>0xFF</code> (U+00FF)
|
452 |
* in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then:
|
453 |
* <ul>
|
454 |
* <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#62",true)}
|
455 |
* returns "<code>></code>".<br />
|
456 |
* The string is recognised as the numeric character reference <code>&#62;</code>
|
457 |
* despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>,
|
458 |
* because its unicode code point U+003E is below the maximum of U+00FF set by this property.
|
459 |
* <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#8364",true)}
|
460 |
* returns "<code>&#8364</code>".<br />
|
461 |
* The string is not recognised as the numeric character reference <code>&#8364;</code>
|
462 |
* because it is <a href="CharacterReference.html#Unterminated">unterminated</a>
|
463 |
* and its unicode code point U+20AC is above the maximum of U+00FF set by this property.
|
464 |
* </ul>
|
465 |
*
|
466 |
* @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
|
467 |
* @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context.
|
468 |
* @see #setUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint)
|
469 |
*/
|
470 |
public int getUnterminatedDecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue) {
|
471 |
return getUnterminatedCharacterReferenceSettings(insideAttributeValue).decimalCharacterReferenceMaxCodePoint;
|
472 |
}
|
473 |
|
474 |
/**
|
475 |
* Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
|
476 |
* <a href="NumericCharacterReference.html#DecimalCharacterReference">decimal character reference</a> which is to be recognised in the specified context.
|
477 |
* <p>
|
478 |
* See {@link #getUnterminatedDecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property.
|
479 |
*
|
480 |
* @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
|
481 |
* @param maxCodePoint the maximum unicode code point.
|
482 |
*/
|
483 |
public void setUnterminatedDecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint) {
|
484 |
getUnterminatedCharacterReferenceSettings(insideAttributeValue).decimalCharacterReferenceMaxCodePoint=maxCodePoint;
|
485 |
}
|
486 |
|
487 |
/**
|
488 |
* Returns the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
|
489 |
* <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context.
|
490 |
* <p>
|
491 |
* For example, if <code>getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(true)</code> had the hypothetical value <code>0xFF</code> (U+00FF)
|
492 |
* in the {@linkplain Config#CurrentCompatibilityMode current compatibility mode}, then:
|
493 |
* <ul>
|
494 |
* <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#x3e",true)}
|
495 |
* returns "<code>></code>".<br />
|
496 |
* The string is recognised as the numeric character reference <code>&#x3e;</code>
|
497 |
* despite the fact that it is <a href="CharacterReference.html#Unterminated">unterminated</a>,
|
498 |
* because its unicode code point U+003E is below the maximum of U+00FF set by this property.
|
499 |
* <li>{@link CharacterReference#decode(CharSequence,boolean) CharacterReference.decode("&#x20ac",true)}
|
500 |
* returns "<code>&#x20ac</code>".<br />
|
501 |
* The string is not recognised as the numeric character reference <code>&#x20ac;</code>
|
502 |
* because it is <a href="CharacterReference.html#Unterminated">unterminated</a>
|
503 |
* and its unicode code point U+20AC is above the maximum of U+00FF set by this property.
|
504 |
* </ul>
|
505 |
*
|
506 |
* @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
|
507 |
* @return the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a> <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context.
|
508 |
* @see #setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue, int maxCodePoint)
|
509 |
*/
|
510 |
public int getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue) {
|
511 |
return getUnterminatedCharacterReferenceSettings(insideAttributeValue).hexadecimalCharacterReferenceMaxCodePoint;
|
512 |
}
|
513 |
|
514 |
/**
|
515 |
* Sets the maximum unicode code point of an <a href="CharacterReference.html#Unterminated">unterminated</a>
|
516 |
* <a href="NumericCharacterReference.html#HexadecimalCharacterReference">hexadecimal character reference</a> which is to be recognised in the specified context.
|
517 |
* <p>
|
518 |
* See {@link #getUnterminatedHexadecimalCharacterReferenceMaxCodePoint(boolean insideAttributeValue)} for the documentation of this property.
|
519 |
*
|
520 |
* @param insideAttributeValue the context within an HTML document - <code>true</code> if inside an attribute value or <code>false</code> if outside an attribute value.
|
521 |
* @param maxCodePoint the maximum unicode code point.
|
522 |
*/
|
523 |
public void setUnterminatedHexadecimalCharacterReferenceMaxCodePoint(final boolean insideAttributeValue, final int maxCodePoint) {
|
524 |
getUnterminatedCharacterReferenceSettings(insideAttributeValue).hexadecimalCharacterReferenceMaxCodePoint=maxCodePoint;
|
525 |
}
|
526 |
|
527 |
/**
|
528 |
* Returns a string representation of this object useful for debugging purposes.
|
529 |
* @return a string representation of this object useful for debugging purposes.
|
530 |
*/
|
531 |
public String getDebugInfo() {
|
532 |
return "Form field name case insensitive: "+formFieldNameCaseInsensitive
|
533 |
+Config.NewLine+"Maximum codepoints in unterminated character references:"
|
534 |
+Config.NewLine+" Inside attribute values:"
|
535 |
+unterminatedCharacterReferenceSettingsInsideAttributeValue
|
536 |
+Config.NewLine+" Outside attribute values:"
|
537 |
+unterminatedCharacterReferenceSettingsOutsideAttributeValue;
|
538 |
}
|
539 |
|
540 |
/**
|
541 |
* Returns the {@linkplain #getName() name} of this compatibility mode.
|
542 |
* @return the {@linkplain #getName() name} of this compatibility mode.
|
543 |
*/
|
544 |
public String toString() {
|
545 |
return getName();
|
546 |
}
|
547 |
|
548 |
UnterminatedCharacterReferenceSettings getUnterminatedCharacterReferenceSettings(final boolean insideAttributeValue) {
|
549 |
return insideAttributeValue ? unterminatedCharacterReferenceSettingsInsideAttributeValue : unterminatedCharacterReferenceSettingsOutsideAttributeValue;
|
550 |
}
|
551 |
}
|
552 |
}
|