/[aagtl_public1]/src/net/htmlparser/jericho/TagType.java
aagtl

Contents of /src/net/htmlparser/jericho/TagType.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 46895 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.*;
24
25 /**
26 * Defines the syntax for a tag type that can be recognised by the parser.
27 * <p>
28 * This class is the root abstract class common to all tag types, and contains methods to {@linkplain #register() register}
29 * and {@linkplain #deregister() deregister} tag types as well as various methods to aid in their implementation.
30 * <p>
31 * Every tag type is represented by a singleton instance of a class that must be a subclass of either
32 * {@link StartTagType} or {@link EndTagType}. These two abstract classes, the only direct descendants of this class,
33 * represent the two major classifications under which every tag type exists.
34 * <p>
35 * Because all <code>TagType</code> instances must be singletons, the '<code>==</code>' operator can be used to test for a particular tag type
36 * instead of the <code>equals(Object)</code> method.
37 * <p>
38 * The term <i><a name="Predefined">predefined tag type</a></i> refers to any of the tag types defined in this library,
39 * including both <a href="#Standard">standard</a> and <a href="#Extended">extended</a> tag types.
40 * <p>
41 * The term <i><a name="Standard">standard tag type</a></i> refers to any of the tag types represented by instances
42 * in static fields of the {@link StartTagType} and {@link EndTagType} subclasses.
43 * Standard tag types are registered by default, and define the tags most commonly found in HTML documents.
44 * <p>
45 * The term <i><a name="Extended">extended tag type</a></i> refers to any <a href="#Predefined">predefined</a> tag type
46 * that is not a <a href="#Standard">standard</a> tag type.
47 * The {@link PHPTagTypes} and {@link MasonTagTypes} classes contain extended tag types related to their respective server platforms.
48 * The tag types defined within them must be registered by the user before they are recognised by the parser.
49 * <p>
50 * The term <i><a name="Custom">custom tag type</a></i> refers to any user-defined tag type, or any tag type that is
51 * not a <a href="#Predefined">predefined</a> tag type.
52 * <p>
53 * The tag recognition process of the parser gives each tag type a <i><a name="Precedence">precedence</a></i> level,
54 * which is primarily determined by the length of its {@linkplain #getStartDelimiter() start delimiter}.
55 * A tag type with a more specific start delimiter is chosen in preference to one with a less specific start delimiter,
56 * assuming they both share the same prefix. If two tag types have exactly the same start delimiter, the one which was
57 * {@linkplain #register() registered} later has the higher precedence.
58 * <p>
59 * The two special tag types {@link StartTagType#UNREGISTERED} and {@link EndTagType#UNREGISTERED} represent
60 * tags that do not match the syntax of any other tag type. They have the lowest <a href="#Precedence">precedence</a>
61 * of all the tag types. The {@link Tag#isUnregistered()} method provides a detailed explanation of unregistered tags.
62 * <p>
63 * See the documentation of the <a href="Tag.html#ParsingProcess">tag parsing process</a> for more information
64 * on how each tag is identified by the parser.
65 * <p>
66 * <a name="Normal"></a>Note that the standard {@linkplain HTMLElementName HTML element names} do not represent different
67 * tag <i>types</i>. All standard HTML tags have a tag type of {@link StartTagType#NORMAL} or {@link EndTagType#NORMAL},
68 * and are also referred to as <i>normal</i> tags.
69 * <p>
70 * Apart from the <a href="#RegistrationRelated">registration related</a> methods, all of the methods in this class and its
71 * subclasses relate to the implementation of <a href="#Custom">custom tag types</a> and are not relevant to the majority of users
72 * who just use the <a href="#Predefined">predefined tag types</a>.
73 * <p>
74 * For performance reasons, this library only allows tag types that {@linkplain #getStartDelimiter() start}
75 * with a '<code>&lt;</code>' character.
76 * The character following this defines the immediate subclass of the tag type.
77 * An {@link EndTagType} always has a slash ('<code>/</code>') as the second character, while a {@link StartTagType}
78 * has any character other than a slash as the second character.
79 * This definition means that tag types which are not intuitively classified as either start tag types or end tag types
80 * (such as an HTML {@linkplain StartTagType#COMMENT comment}) are mostly classified as start tag types.
81 * <p>
82 * Every method in this and the {@link StartTagType} and {@link EndTagType} abstract classes can be categorised
83 * as one of the following:
84 * <dl>
85 * <dt><a name="Property">Properties:</a>
86 * <dd>Simple properties (marked final) that were either specified as parameters
87 * during construction or are derived from those parameters.
88 * <dt><a name="AbstractImplementation">Abstract implementation methods:</a>
89 * <dd>Methods that must be implemented in a subclass.
90 * <dt><a name="DefaultImplementation">Default implementation methods:</a>
91 * <dd>Methods (not marked final) that implement common behaviour, but may be overridden in a subclass.
92 * <dt><a name="ImplementationAssistance">Implementation assistance methods:</a>
93 * <dd>Protected methods that provide low-level functionality and are only of use within other implementation methods.
94 * <dt><a name="RegistrationRelated">Registration related methods:</a>
95 * <dd>Utility methods (marked final) relating to the {@linkplain #register() registration} of tag type instances.
96 * </dl>
97 */
98 public abstract class TagType {
99 private final String description; // debugging-oriented description, exposed by getDescription()
100 private final String startDelimiter; // character sequence marking the start of the tag, exposed by getStartDelimiter()
101 private final String closingDelimiter; // character sequence marking the end of the tag, exposed by getClosingDelimiter()
102 private final boolean isServerTag; // whether this tag type represents a server tag, exposed by isServerTag()
103 private final String namePrefix; // startDelimiter minus its leading startDelimiterPrefix, computed in the constructor
104 final String startDelimiterPrefix; // either "<" or "</"; package-private rather than private
105
106 private static Logger logger=null; // shared by all tag types; NOTE(review): presumably populated by getLogger(), which is defined outside this section - confirm
107
108 TagType(final String description, final String startDelimiter, final String closingDelimiter, final boolean isServerTag, final String startDelimiterPrefix) {
109 // startDelimiterPrefix is either "<" or "</"
110 this.description=description;
111 this.startDelimiter=startDelimiter;
112 this.closingDelimiter=closingDelimiter;
113 this.isServerTag=isServerTag;
114 this.namePrefix=startDelimiter.substring(startDelimiterPrefix.length());
115 this.startDelimiterPrefix=startDelimiterPrefix;
116 }
117
118 /**
119 * Registers this tag type for recognition by the parser.
120 * <br />(<a href="TagType.html#RegistrationRelated">registration related</a> method)
121 * <p>
122 * The order of registration affects the <a href="TagType.html#Precedence">precedence</a> of the tag type when a potential tag is being parsed.
123 *
124 * @see #deregister()
125 */
126 public final void register() {
127 getLogger().debug("Register tag type: "+this);
128 TagTypeRegister.add(this);
129 }
130
131 /**
132 * Deregisters this tag type.
133 * <br />(<a href="TagType.html#RegistrationRelated">registration related</a> method)
134 *
135 * @see #register()
136 */
137 public final void deregister() {
138 getLogger().debug("Deregister tag type "+this);
139 TagTypeRegister.remove(this);
140 }
141
142 /**
143 * Returns a list of all the currently registered tag types in order of lowest to highest <a href="TagType.html#Precedence">precedence</a>.
144 * <br />(<a href="TagType.html#RegistrationRelated">registration related</a> method)
145 * @return a list of all the currently registered tag types in order of lowest to highest <a href="TagType.html#Precedence">precedence</a>.
146 */
147 public static final List<TagType> getRegisteredTagTypes() {
148 return TagTypeRegister.getList();
149 }
150
151 /**
152 * Returns a description of this tag type useful for debugging purposes.
153 * <br />(<a href="TagType.html#Property">property</a> method)
154 *
155 * @return a description of this tag type useful for debugging purposes.
156 */
157 public final String getDescription() {
158 return description;
159 }
160
161 /**
162 * Returns the character sequence that marks the start of the tag.
163 * <br />(<a href="TagType.html#Property">property</a> method)
164 * <p>
165 * The character sequence must be all in lower case.
166 * <p>
167 * The first character in this property <b>must</b> be '<code>&lt;</code>'.
168 * This is a deliberate limitation of the system which is necessary to retain reasonable performance.
169 * <p>
170 * The second character in this property must be '<code>/</code>' if the implementing class is an {@link EndTagType}.
171 * It must <b>not</b> be '<code>/</code>' if the implementing class is a {@link StartTagType}.
172 * <p>
173 * <dl>
174 * <dt>Standard Tag Type Values:</dt>
175 * <dd>
176 * <table class="bordered" style="margin: 15px" cellspacing="0">
177 * <tr><th>Tag Type<th>Start Delimiter
178 * <tr><td>{@link StartTagType#UNREGISTERED}<td><code>&lt;</code>
179 * <tr><td>{@link StartTagType#NORMAL}<td><code>&lt;</code>
180 * <tr><td>{@link StartTagType#COMMENT}<td><code>&lt;!--</code>
181 * <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>&lt;?xml</code>
182 * <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>&lt;?</code>
183 * <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>&lt;!doctype</code>
184 * <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>&lt;!</code>
185 * <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>&lt;![cdata[</code>
186 * <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>&lt;%</code>
187 * <tr><td>{@link EndTagType#UNREGISTERED}<td><code>&lt;/</code>
188 * <tr><td>{@link EndTagType#NORMAL}<td><code>&lt;/</code>
189 * </table>
190 * </dl>
191 * <dl>
192 * <dt>Extended Tag Type Values:</dt>
193 * <dd>
194 * <table class="bordered" style="margin: 15px" cellspacing="0">
195 * <tr><th>Tag Type<th>Start Delimiter
196 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_IF}<td><code>&lt;!--[if</code>
197 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_ENDIF}<td><code>&lt;![endif]--&gt;</code>
198 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_IF}<td><code>&lt;![if</code>
199 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_ENDIF}<td><code>&lt;![endif]&gt;</code>
200 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_IF}<td><code>&lt;!--[if</code>
201 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_ENDIF}<td><code>&lt;!--&lt;![endif]--&gt;</code>
202 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_SIMPLIFIED_IF}<td><code>&lt;!--[if</code>
203 * <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>&lt;script</code>
204 * <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>&lt;?</code>
205 * <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>&lt;?php</code>
206 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>&lt;&amp;</code>
207 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>&lt;&amp;|</code>
208 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>&lt;/&amp;</code>
209 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>&lt;%</code>
210 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>&lt;/%</code>
211 * </table>
212 * </dl>
213 *
214 * @return the character sequence that marks the start of the tag.
215 */
216 public final String getStartDelimiter() {
217 return startDelimiter;
218 }
219
220 /**
221 * Returns the character sequence that marks the end of the tag.
222 * <br />(<a href="TagType.html#Property">property</a> method)
223 * <p>
224 * The character sequence must be all in lower case.
225 * <p>
226 * In a {@link StartTag} of a {@linkplain StartTagType type} that {@linkplain StartTagType#hasAttributes() has attributes},
227 * characters appearing inside a quoted attribute value are ignored when determining the location of the closing delimiter.
228 * <p>
229 * Note that the optional '<code>/</code>' character preceding the closing '<code>&gt;</code>' in an
230 * {@linkplain StartTag#isEmptyElementTag() empty-element tag} is not considered part of the end delimiter.
231 * This property must define the closing delimiter common to all instances of the tag type.
232 * <p>
233 * <dl>
234 * <dt>Standard Tag Type Values:</dt>
235 * <dd>
236 * <table class="bordered" style="margin: 15px" cellspacing="0">
237 * <tr><th>Tag Type<th>Closing Delimiter
238 * <tr><td>{@link StartTagType#UNREGISTERED}<td><code>&gt;</code>
239 * <tr><td>{@link StartTagType#NORMAL}<td><code>&gt;</code>
240 * <tr><td>{@link StartTagType#COMMENT}<td><code>--&gt;</code>
241 * <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>?&gt;</code>
242 * <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>?&gt;</code>
243 * <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>&gt;</code>
244 * <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>&gt;</code>
245 * <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>]]&gt;</code>
246 * <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>%&gt;</code>
247 * <tr><td>{@link EndTagType#UNREGISTERED}<td><code>&gt;</code>
248 * <tr><td>{@link EndTagType#NORMAL}<td><code>&gt;</code>
249 * </table>
250 * </dl>
251 * <dl>
252 * <dt>Extended Tag Type Values:</dt>
253 * <dd>
254 * <table class="bordered" style="margin: 15px" cellspacing="0">
255 * <tr><th>Tag Type<th>Closing Delimiter
256 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_IF}<td><code>]&gt;</code>
257 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_ENDIF}<td><i>(empty&nbsp;string)</i>
258 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_IF}<td><code>]&gt;</code>
259 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_ENDIF}<td><i>(empty&nbsp;string)</i>
260 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_IF}<td><code>]&gt;&lt;!--&gt;</code>
261 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_ENDIF}<td><i>(empty&nbsp;string)</i>
262 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_SIMPLIFIED_IF}<td><code>]&gt;--&gt;</code>
263 * <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>&gt;</code>
264 * <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>?&gt;</code>
265 * <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>?&gt;</code>
266 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>&amp;&gt;</code>
267 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>&amp;&gt;</code>
268 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>&gt;</code>
269 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>&gt;</code>
270 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>&gt;</code>
271 * </table>
272 * </dl>
273 *
274 * @return the character sequence that marks the end of the tag.
275 */
276 public final String getClosingDelimiter() {
277 return closingDelimiter;
278 }
279
280 /**
281 * Indicates whether this tag type represents a server tag.
282 * <br />(<a href="TagType.html#Property">property</a> method)
283 * <p>
284 * Server tags are typically parsed by some process on the web server and substituted with other text or markup before delivery to the
285 * <a target="_blank" href="http://www.w3.org/TR/html401/conform.html#didx-user_agent">user agent</a>.
286 * This parser therefore handles them differently to non-server tags in that they can occur at any position in the document
287 * without regard for the HTML document structure.
288 * As a result they can occur anywhere inside any other tag, although a non-server tag cannot theoretically occur inside a server tag.
289 * <p>
290 * The documentation of the <a href="Tag.html#ParsingProcess">tag parsing process</a> explains in detail
291 * how the value of this property affects the recognition of server tags,
292 * as well as how the presence of server tags affects the recognition of non-server tags in and around them.
293 * <p>
294 * Most XML-style server tags can not be represented as a distinct tag type because they are generally indistinguishable from non-server XML tags.
295 * See the {@link Segment#ignoreWhenParsing()} method for information about how to prevent such server tags from interfering with the proper parsing
296 * of the rest of the document.
297 * <p>
298 * <dl>
299 * <dt>Standard Tag Type Values:</dt>
300 * <dd>
301 * <table class="bordered" style="margin: 15px" cellspacing="0">
302 * <tr><th>Tag Type<th>Is Server Tag
303 * <tr><td>{@link StartTagType#UNREGISTERED}<td><code>false</code>
304 * <tr><td>{@link StartTagType#NORMAL}<td><code>false</code>
305 * <tr><td>{@link StartTagType#COMMENT}<td><code>false</code>
306 * <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>false</code>
307 * <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>false</code>
308 * <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>false</code>
309 * <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>false</code>
310 * <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>false</code>
311 * <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>true</code>
312 * <tr><td>{@link EndTagType#UNREGISTERED}<td><code>false</code>
313 * <tr><td>{@link EndTagType#NORMAL}<td><code>false</code>
314 * </table>
315 * </dl>
316 * <dl>
317 * <dt>Extended Tag Type Values:</dt>
318 * <dd>
319 * <table class="bordered" style="margin: 15px" cellspacing="0">
320 * <tr><th>Tag Type<th>Is Server Tag
321 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_IF}<td><code>false</code>
322 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_ENDIF}<td><code>false</code>
323 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_IF}<td><code>false</code>
324 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_ENDIF}<td><code>false</code>
325 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_IF}<td><code>false</code>
326 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_ENDIF}<td><code>false</code>
327 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_SIMPLIFIED_IF}<td><code>false</code>
328 * <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>true</code>
329 * <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>true</code>
330 * <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>true</code>
331 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>true</code>
332 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>true</code>
333 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>true</code>
334 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>true</code>
335 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>true</code>
336 * </table>
337 * </dl>
338 *
339 * @return <code>true</code> if this tag type represents a server tag, otherwise <code>false</code>.
340 */
341 public final boolean isServerTag() {
342 return isServerTag;
343 }
344
345 /**
346 * Returns the {@linkplain Tag#getName() name} prefix required by this tag type.
347 * <br />(<a href="TagType.html#Property">property</a> method)
348 * <p>
349 * This string is identical to the {@linkplain #getStartDelimiter() start delimiter}, except that it does not include the
350 * initial "<code>&lt;</code>" or "<code>&lt;/</code>" characters that always prefix the start delimiter of a
351 * {@link StartTagType} or {@link EndTagType} respectively.
352 * <p>
353 * The {@linkplain Tag#getName() name} of a tag of this type may or may not include extra characters after the prefix.
354 * This is determined by properties such as {@link StartTagType#isNameAfterPrefixRequired()}
355 * or {@link EndTagTypeGenericImplementation#isStatic()}.
356 * <p>
357 * <dl>
358 * <dt>Standard Tag Type Values:</dt>
359 * <dd>
360 * <table class="bordered" style="margin: 15px" cellspacing="0">
361 * <tr><th>Tag Type<th>Name Prefix
362 * <tr><td>{@link StartTagType#UNREGISTERED}<td><i>(empty&nbsp;string)</i>
363 * <tr><td>{@link StartTagType#NORMAL}<td><i>(empty&nbsp;string)</i>
364 * <tr><td>{@link StartTagType#COMMENT}<td><code>!--</code>
365 * <tr><td>{@link StartTagType#XML_DECLARATION}<td><code>?xml</code>
366 * <tr><td>{@link StartTagType#XML_PROCESSING_INSTRUCTION}<td><code>?</code>
367 * <tr><td>{@link StartTagType#DOCTYPE_DECLARATION}<td><code>!doctype</code>
368 * <tr><td>{@link StartTagType#MARKUP_DECLARATION}<td><code>!</code>
369 * <tr><td>{@link StartTagType#CDATA_SECTION}<td><code>![cdata[</code>
370 * <tr><td>{@link StartTagType#SERVER_COMMON}<td><code>%</code>
371 * <tr><td>{@link EndTagType#UNREGISTERED}<td><i>(empty&nbsp;string)</i>
372 * <tr><td>{@link EndTagType#NORMAL}<td><i>(empty&nbsp;string)</i>
373 * </table>
374 * </dl>
375 * <dl>
376 * <dt>Extended Tag Type Values:</dt>
377 * <dd>
378 * <table class="bordered" style="margin: 15px" cellspacing="0">
379 * <tr><th>Tag Type<th>Name Prefix
380 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_IF}<td><code>!--[if</code>
381 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_HIDDEN_ENDIF}<td><code>![endif]--&gt;</code>
382 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_IF}<td><code>![if</code>
383 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_ENDIF}<td><code>![endif]&gt;</code>
384 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_IF}<td><code>!--[if</code>
385 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_ENDIF}<td><code>!--&lt;![endif]--&gt;</code>
386 * <tr><td>{@link MicrosoftConditionalCommentTagTypes#DOWNLEVEL_REVEALED_VALIDATING_SIMPLIFIED_IF}<td><code>!--[if</code>
387 * <tr><td>{@link PHPTagTypes#PHP_SCRIPT}<td><code>script</code>
388 * <tr><td>{@link PHPTagTypes#PHP_SHORT}<td><code>?</code>
389 * <tr><td>{@link PHPTagTypes#PHP_STANDARD}<td><code>?php</code>
390 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALL}<td><code>&amp;</code>
391 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT}<td><code>&amp;|</code>
392 * <tr><td>{@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT_END}<td><code>&amp;</code>
393 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK}<td><code>%</code>
394 * <tr><td>{@link MasonTagTypes#MASON_NAMED_BLOCK_END}<td><code>%</code>
395 * </table>
396 * </dl>
397 *
398 * @return the {@linkplain Tag#getName() name} prefix required by this tag type.
399 * @see #getStartDelimiter()
400 */
401 protected final String getNamePrefix() {
402 return namePrefix;
403 }
404
405 /**
406 * Indicates whether a tag of this type is valid in the specified position of the specified source document.
407 * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
408 * <p>
409 * This method is called immediately before {@link #constructTagAt(Source, int pos)}
410 * to do a preliminary check on the validity of a tag of this type in the specified position.
411 * <p>
412 * This check is not performed as part of the {@link #constructTagAt(Source, int pos)} call because the same
413 * validation is used for all the <a href="TagType.html#Standard">standard</a> tag types, and is likely to be sufficient
414 * for all <a href="TagType.html#Custom">custom tag types</a>.
415 * Having this check separated into a different method helps to isolate common code from the code that is unique to each tag type.
416 * <p>
417 * A {@linkplain TagType#isServerTag() server tag} is valid in any position except inside a {@linkplain StartTagType#SERVER_COMMON_COMMENT server-side comment},
418 * but a non-server tag is not valid inside any other tag, nor inside elements with CDATA content such as
419 * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
420 * <p>
421 * The common implementation of this method behaves differently depending upon whether or not a {@linkplain Source#fullSequentialParse() full sequential parse}
422 * is being performed.
423 * <p>
424 * For server tags it simply checks that the position is not enclosed by a {@linkplain StartTagType#SERVER_COMMON_COMMENT server-side comment} if a full sequential parse
425 * is not being performed. If a full sequential parse is being performed, it always returns <code>true</code> for server tags as the parser automatically skips over
426 * all positions enclosed by server-side comments, so this method is only called in positions where a server tag is always valid.
427 * <p>
428 * When this method is called for non-server tags during a full sequential parse, the <code>fullSequentialParseData</code> argument contains information
429 * allowing the exact theoretical check to be performed, rejecting a non-server tag if it is inside any other tag.
430 * See below for further information about the <code>fullSequentialParseData</code> parameter.
431 * <p>
432 * When this method is called in <a href="Source.html#ParseOnDemand">parse on demand</a> mode
433 * (not during a full sequential parse, <code>fullSequentialParseData==null</code>),
434 * practical constraints prevent the exact theoretical check from being carried out, and non-server tags are only rejected
435 * if they are found inside HTML {@linkplain StartTagType#COMMENT comments} or {@linkplain StartTagType#CDATA_SECTION CDATA sections}.
436 * <p>
437 * This behaviour is configurable by manipulating the static {@link TagType#getTagTypesIgnoringEnclosedMarkup() TagTypesIgnoringEnclosedMarkup} array
438 * to determine which tag types can not contain non-server tags in <a href="Source.html#ParseOnDemand">parse on demand</a> mode.
439 * The {@linkplain TagType#getTagTypesIgnoringEnclosedMarkup() documentation of this property} contains
440 * a more detailed analysis of the subject and explains why only the {@linkplain StartTagType#COMMENT comment} and
441 * {@linkplain StartTagType#CDATA_SECTION CDATA section} tag types are included by default.
442 * <p>
443 * See the documentation of the <a href="Tag.html#ParsingProcess">tag parsing process</a> for more information about how this method fits into the whole tag parsing process.
444 * <p>
445 * This method can be overridden in <a href="TagType.html#Custom">custom tag types</a> if the default implementation is unsuitable.
446 * <p>
447 * <b>The <code>fullSequentialParseData</code> parameter:</b>
448 * <p>
449 * This parameter is used to discard non-server tags that are found inside other tags or inside {@link HTMLElementName#SCRIPT SCRIPT} elements.
450 * <p>
451 * In the current version of this library, the <code>fullSequentialParseData</code> argument is either <code>null</code>
452 * (in <a href="Source.html#ParseOnDemand">parse on demand</a> mode) or an integer array containing only a single entry
453 * (if a {@linkplain Source#fullSequentialParse() full sequential parse} is being performed).
454 * <p>
455 * The integer contained in the array is the maximum position in the document at which the end of a tag has been found,
456 * indicating that no non-server tags should be recognised before that position.
457 * If no tags have yet been encountered, the value of this integer is zero.
458 * <p>
459 * If the last tag encountered was the {@linkplain StartTag start tag} of a {@link HTMLElementName#SCRIPT SCRIPT} element,
460 * the value of this integer is <code>Integer.MAX_VALUE</code>, indicating that no other non-server elements should be recognised until the
461 * {@linkplain EndTag end tag} of the {@link HTMLElementName#SCRIPT SCRIPT} element is found.
462 * According to the <a target="_blank" href="http://www.w3.org/TR/html401/types.html#idx-CDATA-1">HTML 4.01 specification section 6.2</a>,
463 * the first occurrence of the character sequence "<code>&lt;/</code>" terminates the special handling of CDATA within
464 * {@link HTMLElementName#SCRIPT SCRIPT} and {@link HTMLElementName#STYLE STYLE} elements.
465 * This library however only terminates the CDATA handling of {@link HTMLElementName#SCRIPT SCRIPT} element content
466 * when the character sequence "<code>&lt;/script</code>" is detected, in line with the behaviour of the major browsers.
467 * <p>
468 * Note that the implicit treatment of {@link HTMLElementName#SCRIPT SCRIPT} element content as CDATA should theoretically also prevent the recognition of
469 * {@linkplain StartTagType#COMMENT comments} and explicit {@linkplain StartTagType#CDATA_SECTION CDATA sections} inside script elements.
470 * While this is true for explicit {@linkplain StartTagType#CDATA_SECTION CDATA sections}, the parser does still recognise
471 * {@linkplain StartTagType#COMMENT comments} inside {@link HTMLElementName#SCRIPT SCRIPT} elements in order to maintain compatibility with the major browsers.
472 * This prevents the character sequence "<code>&lt;/script</code>" from terminating the {@link HTMLElementName#SCRIPT SCRIPT} element
473 * if it occurs inside a {@linkplain StartTagType#COMMENT comment}. The end of the {@linkplain StartTagType#COMMENT comment} however also
474 * ends the implicit treatment of the {@link HTMLElementName#SCRIPT SCRIPT} element content as CDATA.
475 * <p>
476 * Although {@link HTMLElementName#STYLE STYLE} elements should theoretically be treated in the same way as {@link HTMLElementName#SCRIPT SCRIPT} elements,
477 * the syntax of <a target="_blank" href="http://www.w3.org/Style/CSS/">Cascading Style Sheets</a> (CSS) does not contain any constructs that
478 * could be misinterpreted as HTML tags, so there is virtually no need to perform any special checks in this case.
479 * <p>
480 * IMPLEMENTATION NOTE: The rationale behind using an integer array to hold this value, rather than a scalar <code>int</code> value,
481 * is to emulate passing the parameter by reference.
482 * This value needs to be shared amongst several internal methods during the {@linkplain Source#fullSequentialParse() full sequential parse} process,
483 * and any one of those methods needs to be able to modify the value and pass it back to the calling method.
484 * This would normally be implemented by passing the parameter by reference, but because Java does not support this language construct, a container for a
485 * mutable integer must be passed instead.
486 * Because the standard Java library does not provide a class for holding a single mutable integer (the <code>java.lang.Integer</code> class is immutable),
487 * the easiest container to use, without creating a class especially for this purpose, is an integer array.
488 * The use of an array does not imply any intention to use more than a single array entry in subsequent versions.
489 *
490 * @param source the {@link Source} document.
491 * @param pos the character position in the source document to check.
492 * @param fullSequentialParseData an integer array containing data allowing this method to implement a better algorithm when a {@linkplain Source#fullSequentialParse() full sequential parse} is being performed, or <code>null</code> in <a href="Source.html#ParseOnDemand">parse on demand</a> mode.
493 * @return <code>true</code> if a tag of this type is valid in the specified position of the specified source document, otherwise <code>false</code>.
494 */
495 protected boolean isValidPosition(final Source source, final int pos, final int[] fullSequentialParseData) {
496 if (isServerTag()) {
497 // the only thing preventing inclusion of a server tag is if it is enclosed by a server comment.
498 if (fullSequentialParseData!=null) return true; // full sequential parse skips over segments enclosed by server comments so no need to check.
499 return !StartTagType.SERVER_COMMON_COMMENT.tagEncloses(source,pos);
500 }
501 if (fullSequentialParseData!=null) {
502 // use simplified check when doing full sequential parse. Normally we are only able to check whether a tag is inside specially cached
503 // tag types for efficiency reasons, but during a full sequential parse we can reject a tag if it is inside any other tag.
504 if (fullSequentialParseData[0]==Integer.MAX_VALUE) { // we are in a SCRIPT element
505 if (this==EndTagType.NORMAL && source.getParseText().containsAt("</script",pos)) {
506 // The character sequence "</script" terminates the implicit CDATA section inside the SCRIPT element
507 fullSequentialParseData[0]=pos;
508 return true;
509 }
510 if (this==StartTagType.COMMENT) {
511 // Although not technically correct, all major browsers also recognise comments inside SCRIPT elements.
512 // The end of the comment will however terminate the implicit CDATA section inside the SCRIPT element.
513 fullSequentialParseData[0]=pos;
514 return true;
515 }
516 return false; // reject any other tags inside SCRIPT element
517 }
518 return pos>=fullSequentialParseData[0]; // accept the non-server tag only if it is after the end of the last found non-server tag
519 }
520 // Use the normal method of checking whether the position is inside a tag of a tag type that ignores enclosed markup:
521 final TagType[] tagTypesIgnoringEnclosedMarkup=getTagTypesIgnoringEnclosedMarkup();
522 for (int i=0; i<tagTypesIgnoringEnclosedMarkup.length; i++) {
523 final TagType tagTypeIgnoringEnclosedMarkup=tagTypesIgnoringEnclosedMarkup[i];
524 // If this tag type is a comment, don't bother checking whether it is inside another comment.
525 // See javadocs for getTagTypesIgnoringEnclosedMarkup() for more explanation.
526 // Allowing it might result in multiple comments being recognised with the same end delimiter, but the risk of this occuring in a syntactically invalid document
527 // is outweighed by the benefit of not recursively checking all previous comments in a document, risking stack overflow.
528 if (this==StartTagType.COMMENT && tagTypeIgnoringEnclosedMarkup==StartTagType.COMMENT) continue;
529 if (tagTypeIgnoringEnclosedMarkup.tagEncloses(source,pos)) return false;
530 }
531 return true;
532 }
533
534 /**
535 * Returns an array of all the tag types inside which the parser ignores all non-{@linkplain #isServerTag() server} tags
536 * in <a href="Source.html#ParseOnDemand">parse on demand</a> mode.
537 * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
538 * <p>
539 * The tag types returned by this property (referred to in the following paragraphs as the "listed types") default to
540 * {@link StartTagType#COMMENT} and {@link StartTagType#CDATA_SECTION}.
541 * <p>
542 * This property is used by the default implementation of the {@link #isValidPosition(Source, int pos, int[] fullSequentialParseData) isValidPosition} method
543 * in <a href="Source.html#ParseOnDemand">parse on demand</a> mode.
544 * It is not used at all during a {@linkplain Source#fullSequentialParse() full sequential parse}.
545 * <p>
546 * In the default implementation of the {@link #isValidPosition(Source, int pos, int[] fullSequentialParseData) isValidPosition} method,
547 * in <a href="Source.html#ParseOnDemand">parse on demand</a> mode,
548 * every new non-server tag found by the parser (referred to as a "new tag") undergoes a check to see whether it is enclosed
549 * by a tag of one of the listed types.
550 * This includes new tags of the listed types themselves if they are non-server tags.
551 * The recursive nature of this check means that <i>all</i> tags of the listed types occurring before the new tag must be found
552 * by the parser before it can determine whether the new tag should be ignored.
553 * To mitigate any performance issues arising from this process, the listed types are given special treatment in the tag cache.
554 * This dramatically decreases the time taken to search on these tag types, so adding a tag type to this array that
555 * is easily recognised and occurs infrequently only results in a small degradation in overall performance.
556 * <p>
557 * A special exception to the algorithm described above applies to {@link StartTagType#COMMENT COMMENT} tags.
558 * The default implementation of the {@link #isValidPosition(Source,int,int[]) isValidPosition} method
559 * does not check whether a {@link StartTagType#COMMENT COMMENT} tag is inside another {@link StartTagType#COMMENT COMMENT} tag,
560 * as this should never happen in a syntactically correct document (the characters '<code>--</code>' should not occur inside a comment).
561 * Skipping this check also avoids the need to recursively check every {@link StartTagType#COMMENT COMMENT} tag back to the start of the document,
562 * which has the potential to cause a stack overflow in a large document containing lots of comments.
563 * <p>
564 * Theoretically, non-server tags appearing inside any other tag should be ignored, which is how the parser behaves during a
565 * {@linkplain Source#fullSequentialParse() full sequential parse}.
566 * <p>
567 * Server tags in particular very often contain other "tags" that should not be recognised as tags by the parser.
568 * If this behaviour is required in <a href="Source.html#ParseOnDemand">parse on demand</a>, the tag type of each server tag that might be found
569 * in the source documents can be added to this property using the static {@link #setTagTypesIgnoringEnclosedMarkup(TagType[])} method.
570 * For example, the following command would prevent non-server tags from being recognised inside {@linkplain PHPTagTypes#PHP_STANDARD standard PHP} tags,
571 * as well as the default {@linkplain StartTagType#COMMENT comment} and {@linkplain StartTagType#CDATA_SECTION CDATA section} tags:
572 * <p>
573 * <blockquote><code>TagType.setTagTypesIgnoringEnclosedMarkup(new TagType[] {PHPTagTypes.PHP_STANDARD, StartTagType.COMMENT, StartTagType.CDATA_SECTION});</code></blockquote>
574 * <p>
575 * The only situation where a non-server tag can legitimately contain a sequence of characters that resembles a tag is within an attribute value.
576 * The <a target="_blank" href="http://www.w3.org/TR/html401/charset.html#h-5.3.2">HTML 4.01 specification section 5.3.2</a>
577 * specifically allows the presence of '<code>&lt;</code>' and '<code>&gt;</code>' characters within attribute values.
578 * A common occurrence of this is in <a target="_blank" href="http://www.w3.org/TR/html401/interact/scripts.html#events">event</a> attributes containing scripts,
579 * such as the <code><a target="_blank" href="http://www.w3.org/TR/html401/interact/scripts.html#adef-onclick">onclick</a></code> attribute.
580 * There is no way of preventing such "tags" from being recognised in <a href="Source.html#ParseOnDemand">parse on demand</a> mode, as adding
581 * {@link StartTagType#NORMAL} to this property as a listed type would be far too inefficient.
582 * Performing a {@linkplain Source#fullSequentialParse() full sequential parse} of the source document prevents these attribute values from being
583 * recognised as tags, but can be very expensive if only a few tags in the document need to be parsed.
584 * The penalty of not parsing every tag in the document is that the exactness of this check is compromised, but in practical terms the difference is inconsequential.
585 * The default listed types of {@linkplain StartTagType#COMMENT comments} and {@linkplain StartTagType#CDATA_SECTION CDATA sections} yield sensible results
586 * in the vast majority of practical applications with only a minor impact on performance.
587 * <p>
588 * In <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a>, '<code>&lt;</code>' and '<code>&gt;</code>' characters
589 * must be represented in attribute values as {@linkplain CharacterReference character references}
590 * (see the XML 1.0 specification section <a target="_blank" href="http://www.w3.org/TR/REC-xml#CleanAttrVals">3.1</a>),
591 * so the situation should never arise that a tag is found inside another tag unless one of them is a
592 * {@linkplain #isServerTag() server tag}.
593 *
594 * @return an array of all the tag types inside which the parser ignores all non-{@linkplain #isServerTag() server} tags.
595 */
596 public static final TagType[] getTagTypesIgnoringEnclosedMarkup() {
597 return TagTypesIgnoringEnclosedMarkup.array;
598 }
599
600 /**
601 * Sets the tag types inside which the parser ignores all non-{@linkplain #isServerTag() server} tags.
602 * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
603 * <p>
604 * See {@link #getTagTypesIgnoringEnclosedMarkup()} for the documentation of this property.
605 *
606 * @param tagTypes an array of tag types.
607 */
608 public static final void setTagTypesIgnoringEnclosedMarkup(TagType[] tagTypes) {
609 if (tagTypes==null) throw new IllegalArgumentException();
610 TagTypesIgnoringEnclosedMarkup.array=tagTypes;
611 }
612
613 /**
614 * Constructs a tag of this type at the specified position in the specified source document if it matches all of the required features.
615 * <br />(<a href="TagType.html#AbstractImplementation">abstract implementation</a> method)
616 * <p>
617 * The implementation of this method must check that the text at the specified position meets all of
618 * the criteria of this tag type, including such checks as the presence of the correct or well formed
619 * {@linkplain #getClosingDelimiter() closing delimiter}, {@linkplain Tag#getName() name}, {@linkplain Attributes attributes},
620 * {@linkplain EndTag end tag}, or any other distinguishing features.
621 * <p>
622 * It can be assumed that the specified position starts with the {@linkplain #getStartDelimiter() start delimiter} of this tag type,
623 * and that all other tag types with higher <a href="TagType.html#Precedence">precedence</a> (if any) have already been rejected as candidates.
624 * Tag types with lower precedence will be considered if this method returns <code>null</code>.
625 * <p>
626 * This method is only called after a successful check of the tag's position, i.e.
627 * {@link #isValidPosition(Source,int,int[]) isValidPosition(source,pos,fullSequentialParseData)}<code>==true</code>.
628 * <p>
629 * The {@link StartTagTypeGenericImplementation} and {@link EndTagTypeGenericImplementation} subclasses provide default
630 * implementations of this method that allow the use of much simpler <a href="TagType.html#Property">properties</a> and
631 * <a href="TagType.html#ImplementationAssistance">implementation assistance</a> methods to carry out the required functions.
632 *
633 * @param source the {@link Source} document.
634 * @param pos the position in the source document.
635 * @return a tag of this type at the specified position in the specified source document if it meets all of the required features, or <code>null</code> if it does not meet the criteria.
636 */
637 protected abstract Tag constructTagAt(Source source, int pos);
638
639 /**
640 * Indicates whether a tag of this type encloses the specified position of the specified source document.
641 * <br />(<a href="TagType.html#ImplementationAssistance">implementation assistance</a> method)
642 * <p>
643 * This is logically equivalent to <code>source.</code>{@link Source#getEnclosingTag(int,TagType) getEnclosingTag(pos,this)}<code>!=null</code>,
644 * but is safe to use within other implementation methods without the risk of causing an infinite recursion.
645 * <p>
646 * This method is called from the default implementation of the {@link #isValidPosition(Source, int pos, int[] fullSequentialParseData)} method.
647 *
648 * @param source the {@link Source} document.
649 * @param pos the character position in the source document to check.
650 * @return <code>true</code> if a tag of this type encloses the specified position of the specified source document, otherwise <code>false</code>.
651 */
652 protected final boolean tagEncloses(final Source source, final int pos) {
653 if (pos==0) return false;
654 final Tag enclosingTag=source.getEnclosingTag(pos-1,this); // use pos-1 otherwise a tag at pos could cause infinite recursion when this is called from constructTagAt
655 return enclosingTag!=null && pos!=enclosingTag.getEnd(); // make sure pos!=enclosingTag.getEnd() to compensate for using pos-1 above (important if the tag in question immediately follows an end tag delimiter)
656 }
657
658 /**
659 * Returns a string representation of this object useful for debugging purposes.
660 * @return a string representation of this object useful for debugging purposes.
661 */
662 public String toString() {
663 return getDescription();
664 }
665
666 static final Tag getTagAt(final Source source, final int pos, final boolean serverTagOnly, final boolean assumeNoNestedTags) {
667 final TagTypeRegister.ProspectiveTagTypeIterator prospectiveTagTypeIterator=new TagTypeRegister.ProspectiveTagTypeIterator(source,pos);
668 // prospectiveTagTypeIterator is empty if pos is out of range.
669 while (prospectiveTagTypeIterator.hasNext()) {
670 final TagType tagType=prospectiveTagTypeIterator.next();
671 if (serverTagOnly && !tagType.isServerTag()) continue;
672 if (!assumeNoNestedTags && !tagType.isValidPosition(source,pos,source.fullSequentialParseData)) continue;
673 try {
674 final Tag tag=tagType.constructTagAt(source,pos);
675 if (tag!=null) return tag;
676 } catch (IndexOutOfBoundsException ex) {
677 if (source.logger.isInfoEnabled()) source.logger.info(source.getRowColumnVector(pos).appendTo(new StringBuilder(200).append("Tag at ")).append(" not recognised as type '").append(tagType.getDescription()).append("' because it has no end delimiter").toString());
678 }
679 }
680 return null;
681 }
682
683 private static Logger getLogger() {
684 if (logger==null) logger=Source.newLogger();
685 return logger;
686 }
687
688 private static final class TagTypesIgnoringEnclosedMarkup {
689 // This internal class is used to contain the array because its static initialisation can occur after
690 // the StartTagType.COMMENT and StartTagType.CDATA_SECTION members have been created.
691 public static TagType[] array=new TagType[] {
692 StartTagType.COMMENT,
693 StartTagType.CDATA_SECTION
694 };
695 }
696 }
697

   
Visit the aagtl Website