/[aagtl_public1]/src/net/htmlparser/jericho/StartTag.java
aagtl

Contents of /src/net/htmlparser/jericho/StartTag.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 42936 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.Map;
24 import java.util.Set;
25 import java.io.IOException;
26 import java.util.regex.Pattern;
27
28 /**
29 * Represents the <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-2">start tag</a> of an
30 * {@linkplain Element element} in a specific {@linkplain Source source} document.
31 * <p>
32 * A start tag always has a {@linkplain #getTagType() type} that is a subclass of {@link StartTagType}, meaning that any tag
33 * that does <b>not</b> start with the characters '<code>&lt;/</code>' is categorised as a start tag.
34 * <p>
35 * This includes many tags which stand alone, without a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag},
36 * and would not intuitively be categorised as a "start tag".
37 * For example, an HTML {@linkplain StartTagType#COMMENT comment} is represented as a single start tag that spans the whole comment,
38 * and does not have an end tag at all.
39 * <p>
40 * See the <a href="StartTagType.html#field_summary">static fields</a> defined in the {@link StartTagType} class for a list of the
41 * <a href="TagType.html#Standard">standard</a> start tag types.
42 * <p>
43 * <code>StartTag</code> instances are obtained using one of the following methods:
44 * <ul>
45 * <li>{@link Element#getStartTag()}
46 * <li>{@link Tag#getNextTag()}
47 * <li>{@link Tag#getPreviousTag()}
48 * <li>{@link Source#getPreviousStartTag(int pos)}
49 * <li>{@link Source#getPreviousStartTag(int pos, String name)}
50 * <li>{@link Source#getPreviousTag(int pos)}
51 * <li>{@link Source#getPreviousTag(int pos, TagType)}
52 * <li>{@link Source#getNextStartTag(int pos)}
53 * <li>{@link Source#getNextStartTag(int pos, String name)}
54 * <li>{@link Source#getNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive)}
55 * <li>{@link Source#getNextTag(int pos)}
56 * <li>{@link Source#getNextTag(int pos, TagType)}
57 * <li>{@link Source#getEnclosingTag(int pos)}
58 * <li>{@link Source#getEnclosingTag(int pos, TagType)}
59 * <li>{@link Source#getTagAt(int pos)}
60 * <li>{@link Segment#getAllStartTags()}
61 * <li>{@link Segment#getAllStartTags(String name)}
62 * <li>{@link Segment#getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)}
63 * <li>{@link Segment#getAllTags()}
64 * <li>{@link Segment#getAllTags(TagType)}
65 * </ul>
66 * <p>
67 * The methods above which accept a <code>name</code> parameter are categorised as <a href="Tag.html#NamedSearch">named search</a> methods.
68 * <p>
69 * In such methods dealing with start tags, specifying an argument to the <code>name</code> parameter that ends in a
70 * colon (<code>:</code>) searches for all start tags in the specified XML namespace.
71 * <p>
72 * The constants defined in the {@link HTMLElementName} interface can be used directly as arguments to these <code>name</code> parameters.
73 * For example, <code>source.getAllStartTags(</code>{@link HTMLElementName#A}<code>)</code> is equivalent to
74 * <code>source.getAllStartTags("a")</code>, and gets all hyperlink start tags.
75 * <p>
76 * The {@link Tag} superclass defines a method called {@link Tag#getName() getName()} to get the name of this start tag.
77 * <p>
78 * See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-stag">start tags</a>.
79 *
80 * @see Tag
81 * @see Element
82 * @see EndTag
83 */
84 public final class StartTag extends Tag {
85 private final Attributes attributes; // attributes supplied at construction; may be null (the constructor and getAttributeValue both guard against it)
86 final StartTagType startTagType; // type of this start tag, returned by getStartTagType() and getTagType(); null only for the no-arg placeholder instance
87
88 static final StartTag NOT_CACHED=(StartTag)Tag.NOT_CACHED; // Tag.NOT_CACHED viewed as a StartTag
89
90 /**
91 * Constructs a new <code>StartTag</code>.
92 *
93 * @param source the {@link Source} document.
94 * @param begin the character position in the source document where this tag {@linkplain Segment#getBegin() begins}.
95 * @param end the character position in the source document where this tag {@linkplain Segment#getEnd() ends}.
96 * @param startTagType the {@linkplain #getStartTagType() type} of the start tag.
97 * @param name the {@linkplain Tag#getName() name} of the tag.
98 * @param attributes the {@linkplain #getAttributes() attributes} of the tag.
99 */
StartTag(final Source source, final int begin, final int end, final StartTagType startTagType, final String name, final Attributes attributes) {
	super(source,begin,end,name);
	this.startTagType=startTagType;
	this.attributes=attributes;
	// Give the attributes (when there are any) a reference back to the tag that owns them.
	if (attributes!=null) {
		attributes.setStartTag(this);
	}
}
106
107 // only used to create Tag.NOT_CACHED
StartTag() {
	// Placeholder instance: it carries neither a type nor attributes.
	this.startTagType=null;
	this.attributes=null;
}
112
113 /**
114 * Returns the {@linkplain Element element} that is started by this start tag.
115 * Guaranteed not <code>null</code>.
116 * <p>
117 * <dl>
118 * <dt>Example 1: Elements for which the {@linkplain HTMLElements#getEndTagRequiredElementNames() end tag is required}</dt>
119 * <dd>
120 * <pre>
121 * 1. &lt;div&gt;
122 * 2. &lt;div&gt;
123 * 3. &lt;div&gt;
124 * 4. &lt;div&gt;This is line 4&lt;/div&gt;
125 * 5. &lt;/div&gt;
126 * 6. &lt;div&gt;This is line 6&lt;/div&gt;
127 * 7. &lt;/div&gt;</pre>
128 * <ul>
129 * <li>The start tag on line 1 returns an empty element spanning only the start tag.
130 * This is because the end tag of a <code>&lt;div&gt;</code> element is required,
131 * making the sample code invalid as all the end tags are matched with other start tags.
132 * <li>The start tag on line 2 returns an element spanning to the end of line 7.
133 * <li>The start tag on line 3 returns an element spanning to the end of line 5.
134 * <li>The start tag on line 4 returns an element spanning to the end of line 4.
135 * <li>The start tag on line 6 returns an element spanning to the end of line 6.
136 * </ul>
137 * <p>
138 * </dd>
139 * <dt>Example 2: Elements for which the {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional}</dt>
140 * <dd>
141 * <pre>
142 * 1. &lt;ul&gt;
143 * 2. &lt;li&gt;item 1
144 * 3. &lt;li&gt;item 2
145 * 4. &lt;ul&gt;
146 * 5. &lt;li&gt;subitem 1&lt;/li&gt;
147 * 6. &lt;li&gt;subitem 2
148 * 7. &lt;/ul&gt;
149 * 8. &lt;li&gt;item 3&lt;/li&gt;
150 * 9. &lt;/ul&gt;</pre>
151 * <ul>
152 * <li>The start tag on line 1 returns an element spanning to the end of line 9.
153 * <li>The start tag on line 2 returns an element spanning to the start of the <code>&lt;li&gt;</code> start tag on line 3.
154 * <li>The start tag on line 3 returns an element spanning to the start of the <code>&lt;li&gt;</code> start tag on line 8.
155 * <li>The start tag on line 4 returns an element spanning to the end of line 7.
156 * <li>The start tag on line 5 returns an element spanning to the end of line 5.
157 * <li>The start tag on line 6 returns an element spanning to the start of the <code>&lt;/ul&gt;</code> end tag on line 7.
158 * <li>The start tag on line 8 returns an element spanning to the end of line 8.
159 * </ul>
160 * </dd>
161 * </dl>
162 *
163 * @return the {@linkplain Element element} that is started by this start tag.
164 */
165 public Element getElement() {
166 if (element==Element.NOT_CACHED) {
167 final EndTag endTag=getEndTagInternal();
168 element=new Element(source,this,endTag);
169 if (endTag!=null) {
170 if (endTag.element!=Element.NOT_CACHED) {
171 // This is presumably impossible, except in certain circumstances where the cache was cleared, such as if the parser decides to do a full sequential parse after some tags have already been found.
172 // If the existing element and the current element are not the same, log it.
173 if (source.logger.isInfoEnabled() && !element.equals(endTag.element)) source.logger.info(source.getRowColumnVector(endTag.begin).appendTo(new StringBuilder(200).append("End tag ").append(endTag).append(" at ")).append(" terminates more than one element").toString());
174 }
175 endTag.element=element;
176 }
177 }
178 return element;
179 }
180
181 /**
182 * Indicates whether this start tag is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
183 * <p>
184 * This property checks that the tag is {@linkplain #isSyntacticalEmptyElementTag() syntactically an empty-element tag},
185 * but in addition checks that the {@linkplain #getName() name} of the tag is not one that is defined in the HTML specification to have a
186 * {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or an {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag,
187 * which the major browsers do not recognise as empty-element tags, even in an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document.
188 * <p>
189 * This is equivalent to:<br />
190 * {@link #isSyntacticalEmptyElementTag()}<code> && !(</code>{@link HTMLElements#getEndTagOptionalElementNames()}<code>.contains(</code>{@link #getName() getName()}<code>) || </code>{@link HTMLElements#getEndTagRequiredElementNames()}<code>.contains(</code>{@link #getName() getName()}<code>))</code>.
191 * <p>
192 * You can set the static {@link Config#IsHTMLEmptyElementTagRecognised} property to <code>true</code> to force the parser to recognise all empty-element tags,
193 * making this method exactly equivalent to {@link #isSyntacticalEmptyElementTag()}.
194 *
195 * @return <code>true</code> if this start tag is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
196 */
197 public boolean isEmptyElementTag() {
198 return isSyntacticalEmptyElementTag() && !HTMLElements.isClosingSlashIgnored(name);
199 }
200
201 /**
202 * Indicates whether this start tag is syntactically an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
203 * <p>
204 * This is signified by the characters "/&gt;" at the end of the start tag.
205 * <p>
206 * Only a {@linkplain StartTagType#NORMAL normal} start tag can be syntactically an empty-element tag.
207 * <p>
208 * This property simply reports whether the syntax of the start tag is consistent with that of an empty-element tag,
209 * it does not guarantee that this start tag's {@linkplain #getElement() element} is actually {@linkplain Element#isEmpty() empty}.
210 * <p>
211 * This possible discrepancy reflects the way major browsers interpret illegal empty element tags used in
212 * <a href="HTMLElements.html#HTMLElement">HTML elements</a>, and is explained further in the documentation of the
213 * {@link #isEmptyElementTag()} property.
214 *
215 * @return <code>true</code> if this start tag is syntactically an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
216 * @see #isEmptyElementTag()
217 */
218 public boolean isSyntacticalEmptyElementTag() {
219 return startTagType==StartTagType.NORMAL && source.charAt(end-2)=='/';
220 }
221
222 /**
223 * Returns the {@linkplain StartTagType type} of this start tag.
224 * <p>
225 * This is equivalent to <code>(StartTagType)</code>{@link #getTagType()}.
226 *
227 * @return the {@linkplain StartTagType type} of this start tag.
228 */
229 public StartTagType getStartTagType() {
230 return startTagType;
231 }
232
233 // Documentation inherited from Tag
234 public TagType getTagType() {
235 return startTagType;
236 }
237
238 /**
239 * Returns the attributes specified in this start tag.
240 * <p>
241 * Return value is not <code>null</code> if and only if
242 * {@link #getStartTagType()}<code>.</code>{@link StartTagType#hasAttributes() hasAttributes()}<code>==true</code>.
243 * <p>
244 * To force the parsing of attributes in other start tag types, use the {@link #parseAttributes()} method instead.
245 *
246 * @return the attributes specified in this start tag, or <code>null</code> if the {@linkplain #getStartTagType() type} of this start tag does not {@linkplain StartTagType#hasAttributes() have attributes}.
247 * @see #parseAttributes()
248 * @see Source#parseAttributes(int pos, int maxEnd)
249 */
250 public Attributes getAttributes() {
251 return attributes;
252 }
253
254 /**
255 * Returns the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name (case insensitive).
256 * <p>
257 * Returns <code>null</code> if this start tag does not {@linkplain StartTagType#hasAttributes() have attributes},
258 * no attribute with the specified name exists or the attribute {@linkplain Attribute#hasValue() has no value}.
259 * <p>
260 * This is equivalent to {@link #getAttributes()}<code>.</code>{@link Attributes#getValue(String) getValue(attributeName)},
261 * except that it returns <code>null</code> if this start tag does not have attributes instead of throwing a
262 * <code>NullPointerException</code>.
263 *
264 * @param attributeName the name of the attribute to get.
265 * @return the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name, or <code>null</code> if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}.
266 */
267 public String getAttributeValue(final String attributeName) {
268 return attributes==null ? null : attributes.getValue(attributeName);
269 }
270
271 /**
272 * Parses the attributes specified in this start tag, regardless of the type of start tag.
273 * This method is only required in the unusual situation where attributes exist in a start tag whose
274 * {@linkplain #getStartTagType() type} doesn't {@linkplain StartTagType#hasAttributes() have attributes}.
275 * <p>
276 * This method returns the cached attributes from the {@link StartTag#getAttributes()} method
277 * if its value is not <code>null</code>, otherwise the source is physically parsed with each call to this method.
278 * <p>
279 * This is equivalent to {@link #parseAttributes(int) parseAttributes}<code>(</code>{@link Attributes#getDefaultMaxErrorCount()}<code>)</code>.
280 *
281 * @return the attributes specified in this start tag, or <code>null</code> if too many errors occur while parsing.
282 * @see #getAttributes()
283 * @see Source#parseAttributes(int pos, int maxEnd)
284 */
285 public Attributes parseAttributes() {
286 return parseAttributes(Attributes.getDefaultMaxErrorCount());
287 }
288
289 /**
290 * Parses the attributes specified in this start tag, regardless of the type of start tag.
291 * This method is only required in the unusual situation where attributes exist in a start tag whose
292 * {@linkplain #getStartTagType() type} doesn't {@linkplain StartTagType#hasAttributes() have attributes}.
293 * <p>
294 * See the documentation of the {@link #parseAttributes()} method for more information.
295 *
296 * @param maxErrorCount the maximum number of minor errors allowed while parsing
297 * @return the attributes specified in this start tag, or <code>null</code> if too many errors occur while parsing.
298 * @see #getAttributes()
299 */
300 public Attributes parseAttributes(final int maxErrorCount) {
301 if (attributes!=null) return attributes;
302 final int maxEnd=end-startTagType.getClosingDelimiter().length();
303 int attributesBegin=begin+1+name.length();
304 // skip any non-name characters directly after the name (which are quite common)
305 while (!isXMLNameStartChar(source.charAt(attributesBegin))) {
306 attributesBegin++;
307 if (attributesBegin==maxEnd) return null;
308 }
309 Attributes attributes=Attributes.construct(source,begin,attributesBegin,maxEnd,startTagType,name,maxErrorCount);
310 if (attributes!=null) attributes.setStartTag(this);
311 return attributes;
312 }
313
314 /**
315 * Returns the segment between the end of the tag's {@linkplain #getName() name} and the start of its <a href="#EndDelimiter">end delimiter</a>.
316 * <p>
317 * This method is normally only of use for start tags whose content is something other than {@linkplain #getAttributes() attributes}.
318 * <p>
319 * A new {@link Segment} object is created with each call to this method.
320 *
321 * @return the segment between the end of the tag's {@linkplain #getName() name} and the start of the <a href="#EndDelimiter">end delimiter</a>.
322 */
323 public Segment getTagContent() {
324 return new Segment(source,begin+1+name.length(),end-startTagType.getClosingDelimiter().length());
325 }
326
327 /**
328 * Returns the {@link FormControl} defined by this start tag.
329 * <p>
330 * This is equivalent to {@link #getElement()}<code>.</code>{@link Element#getFormControl() getFormControl()}.
331 *
332 * @return the {@link FormControl} defined by this start tag, or <code>null</code> if it is not a <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#form-controls">control</a>.
333 */
334 public FormControl getFormControl() {
335 return getElement().getFormControl();
336 }
337
338 /**
339 * Indicates whether a matching end tag is forbidden.
340 * <p>
341 * This property returns <code>true</code> if one of the following conditions is met:
342 * <ul>
343 * <li>The {@linkplain #getStartTagType() type} of this start tag does not specify a
344 * {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}.
345 * <li>The {@linkplain #getName() name} of this start tag indicates it is the start of an
346 * <a href="Element.html#HTML">HTML element</a> whose {@linkplain HTMLElements#getEndTagForbiddenElementNames() end tag is forbidden}.
347 * <li>This start tag is {@linkplain #isSyntacticalEmptyElementTag() syntactically an empty-element tag} and its
348 * {@linkplain #getName() name} indicates it is the start of a <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
349 * </ul>
350 * <p>
351 * If this property returns <code>true</code> then this start tag's {@linkplain #getElement() element} will always be a
352 * <a href="Element.html#SingleTag">single tag element</a>.
353 *
354 * @return <code>true</code> if a matching end tag is forbidden, otherwise <code>false</code>.
355 */
356 public boolean isEndTagForbidden() {
357 if (getStartTagType()!=StartTagType.NORMAL)
358 return getStartTagType().getCorrespondingEndTagType()==null;
359 if (HTMLElements.getEndTagForbiddenElementNames().contains(name)) return true;
360 if (HTMLElements.getElementNames().contains(name)) return false;
361 return isSyntacticalEmptyElementTag();
362 }
363
364 /**
365 * Indicates whether a matching end tag is required.
366 * <p>
367 * This property returns <code>true</code> if one of the following conditions is met:
368 * <ul>
369 * <li>The {@linkplain #getStartTagType() type} of this start tag is NOT {@link StartTagType#NORMAL}, but specifies a
370 * {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}.
371 * <li>The {@linkplain #getName() name} of this start tag indicates it is the start of an
372 * <a href="Element.html#HTML">HTML element</a> whose {@linkplain HTMLElements#getEndTagRequiredElementNames() end tag is required}.
373 * <li>This start tag is NOT {@linkplain #isSyntacticalEmptyElementTag() syntactically an empty-element tag} and its
374 * {@linkplain #getName() name} indicates it is the start of a <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>.
375 * </ul>
376 *
377 * @return <code>true</code> if a matching end tag is required, otherwise <code>false</code>.
378 */
379 public boolean isEndTagRequired() {
380 if (getStartTagType()!=StartTagType.NORMAL)
381 return getStartTagType().getCorrespondingEndTagType()!=null;
382 if (HTMLElements.getEndTagRequiredElementNames().contains(name)) return true;
383 if (HTMLElements.getElementNames().contains(name)) return false;
384 return !isSyntacticalEmptyElementTag();
385 }
386
387 // Documentation inherited from Tag
388 public boolean isUnregistered() {
389 return startTagType==StartTagType.UNREGISTERED;
390 }
391
392 /**
393 * Returns an XML representation of this start tag.
394 * <p>
395 * This is equivalent to {@link #tidy(boolean) tidy(false)}, thereby keeping the {@linkplain #getName() name} of the tag in its original case.
396 * <p>
397 * See the documentation of the {@link #tidy(boolean toXHTML)} method for more details.
398 *
399 * @return an XML representation of this start tag, or the {@linkplain Segment#toString() source text} if it is of a {@linkplain #getStartTagType() type} that does not {@linkplain StartTagType#hasAttributes() have attributes}.
400 */
401 public String tidy() {
402 return tidy(false);
403 }
404
405 /**
406 * Returns an XML or XHTML representation of this start tag.
407 * <p>
408 * The tidying of the tag is carried out as follows:
409 * <ul>
410 * <li>if this start tag is of a {@linkplain #getStartTagType() type} that does not {@linkplain StartTagType#hasAttributes() have attributes},
411 * then the original {@linkplain Segment#toString() source text} of the entire tag is returned.
412 * <li>if this start tag contains any {@linkplain TagType#isServerTag() server tags} outside of an attribute value,
413 * then the original {@linkplain Segment#toString() source text} of the entire tag is returned.
414 * <li>name converted to lower case if the <code>toXHTML</code> argument is <code>true</code> and this is a {@linkplain StartTagType#NORMAL normal} start tag
415 * <li>attributes separated by a single space
416 * <li>attribute names in original case
417 * <li>attribute values are enclosed in double quotes and {@linkplain CharacterReference#reencode(CharSequence) re-encoded}
418 * <li>if this start tag forms an <a href="Element.html#HTML">HTML element</a> that has no {@linkplain Element#getEndTag() end tag},
419 * a slash is inserted before the closing angle bracket, separated from the {@linkplain #getName() name} or last attribute by a single space.
420 * <li>if an attribute value contains a {@linkplain TagType#isServerTag() server tag} it is inserted verbatim instead of being
421 * {@linkplain CharacterReference#encode(CharSequence) encoded}.
422 * </ul>
423 * <p>
424 * The <code>toXHTML</code> parameter determines only whether the name is converted to lower case for {@linkplain StartTagType#NORMAL normal} tags.
425 * In all other respects the generated tag is already valid XHTML.
426 * <p>
427 * <dl>
428 * <dt>Example:</dt>
429 * <dd>
430 * <p>
431 * The following source text:
432 * <blockquote class="code">
433 * <code>&lt;INPUT name=Company value='G&amp;uuml;nter O&amp#39;Reilly &amp;amp Associ&eacute;s'&gt;</code>
434 * </blockquote>
435 * produces the following regenerated HTML:
436 * <blockquote class="code">
437 * <code>&lt;input name="Company" value="G&amp;uuml;nter O'Reilly &amp;amp; Associ&amp;eacute;s" /&gt;</code>
438 * </blockquote>
439 * </dd>
440 * </dl>
441 *
442 * @param toXHTML specifies whether the output is XHTML.
443 * @return an XML or XHTML representation of this start tag, or the {@linkplain Segment#toString() source text} if it is of a {@linkplain #getStartTagType() type} that does not {@linkplain StartTagType#hasAttributes() have attributes}.
444 */
445 public String tidy(boolean toXHTML) {
446 if (attributes==null || attributes.containsServerTagOutsideOfAttributeValue) return toString();
447 final StringBuilder sb=new StringBuilder();
448 sb.append('<');
449 if (toXHTML && startTagType==StartTagType.NORMAL) {
450 sb.append(name);
451 } else {
452 int i=begin+startTagType.startDelimiterPrefix.length();
453 final int nameSegmentEnd=i+name.length();
454 while (i<nameSegmentEnd) {
455 sb.append(source.charAt(i));
456 i++;
457 }
458 }
459 try {
460 attributes.appendTidy(sb,getNextTag());
461 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens
462 if (startTagType==StartTagType.NORMAL && getElement().getEndTag()==null && !HTMLElements.getEndTagOptionalElementNames().contains(name)) sb.append(" /");
463 sb.append(startTagType.getClosingDelimiter());
464 return sb.toString();
465 }
466
467 /**
468 * Generates the HTML text of a {@linkplain StartTagType#NORMAL normal} start tag with the specified tag name and {@linkplain Attributes#populateMap(Map,boolean) attributes map}.
469 * <p>
470 * The output of the attributes is as described in the {@link Attributes#generateHTML(Map attributesMap)} method.
471 * <p>
472 * The <code>emptyElementTag</code> parameter specifies whether the start tag should be an
473 * <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>,
474 * in which case a slash is inserted before the closing angle bracket, separated from the name
475 * or last attribute by a single space.
476 * <p>
477 * <dl>
478 * <dt>Example:</dt>
479 * <dd>
480 * <p>
481 * The following code:
482 * <blockquote class="code">
483 * <pre>
484 * LinkedHashMap attributesMap=new LinkedHashMap();
485 * attributesMap.put("name","Company");
486 * attributesMap.put("value","G\n00fcnter O'Reilly & Associ&eacute;s");
487 * System.out.println(StartTag.generateHTML("INPUT",attributesMap,true));</pre>
488 * </blockquote>
489 * generates the following output:
490 * <blockquote class="code">
491 * <code>&lt;INPUT name="Company" value="G&amp;uuml;nter O'Reilly &amp;amp; Associ&amp;eacute;s" /&gt;</code>
492 * </blockquote>
493 * </dd>
494 * </dl>
495 *
496 * @param tagName the name of the start tag.
497 * @param attributesMap a map containing attribute name/value pairs.
498 * @param emptyElementTag specifies whether the start tag should be an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
499 * @return the HTML text of a {@linkplain StartTagType#NORMAL normal} start tag with the specified tag name and {@linkplain Attributes#populateMap(Map,boolean) attributes map}.
500 * @see EndTag#generateHTML(String tagName)
501 */
502 public static String generateHTML(final String tagName, final Map<String,String> attributesMap, final boolean emptyElementTag) {
503 final StringBuilder sb=new StringBuilder();
504 sb.append('<').append(tagName);
505 try {
506 Attributes.appendHTML(sb,attributesMap);
507 } catch (IOException ex) {throw new RuntimeException(ex);} // never happens
508 if (emptyElementTag)
509 sb.append(" />");
510 else
511 sb.append('>');
512 return sb.toString();
513 }
514
515 public String getDebugInfo() {
516 final StringBuilder sb=new StringBuilder();
517 appendDebugTag(sb);
518 sb.append(' ');
519 appendDebugTagType(sb);
520 sb.append(super.getDebugInfo());
521 return sb.toString();
522 }
523
524 StringBuilder appendDebugTag(final StringBuilder sb) {
525 if (startTagType==StartTagType.NORMAL && getAttributes().isEmpty()) {
526 sb.append(this);
527 } else {
528 sb.append('<').append(getNameSegment()).append(' ');
529 if (isSyntacticalEmptyElementTag()) sb.append('/');
530 sb.append(startTagType.getClosingDelimiter());
531 }
532 return sb;
533 }
534
535 StringBuilder appendDebugTagType(final StringBuilder sb) {
536 if (startTagType!=StartTagType.NORMAL) sb.append('(').append(startTagType.getDescription()).append(") ");
537 return sb;
538 }
539
540 private EndTag getEndTagInternal() {
541 boolean checkForEmptyElementTag=true;
542 // A missing optional end tag returns a zero length EndTag instead of null
543 final EndTagType endTagType=startTagType.getCorrespondingEndTagType();
544 if (startTagType==StartTagType.NORMAL) {
545 checkForEmptyElementTag=!HTMLElements.isClosingSlashIgnored(name); // check for empty-element tags if tag is not an HTML element
546 if (checkForEmptyElementTag && isSyntacticalEmptyElementTag()) // non-html empty-element tag
547 return null;
548 if (HTMLElements.getEndTagForbiddenElementNames().contains(name)) // end tag is forbidden
549 return null; // *** maybe add option to look for end tag if parsing strict XML?
550 final HTMLElementTerminatingTagNameSets terminatingTagNameSets=HTMLElements.getTerminatingTagNameSets(name);
551 if (terminatingTagNameSets!=null) // end tag is optional
552 return getOptionalEndTag(terminatingTagNameSets);
553 } else if (endTagType==null) {
554 return null;
555 }
556 // This is either a start tag type other than NORMAL that requires an end tag, or an HTML element tag that requires an end tag,
557 // or a non-HTML element tag that is not an empty-element tag.
558 // In all of these cases the end tag is required.
559 final EndTag nextEndTag=source.getNextEndTag(end,endTagType.getEndTagName(name),endTagType);
560 if (nextEndTag!=null) {
561 if (startTagType==StartTagType.NORMAL && HTMLElements.END_TAG_REQUIRED_NESTING_FORBIDDEN_SET.contains(name)) {
562 final StartTag nextStartTag=source.getNextStartTag(end,name);
563 if (nextStartTag==null || nextStartTag.begin>nextEndTag.begin) return nextEndTag;
564 if (source.logger.isInfoEnabled()) source.logger.info(source.getRowColumnVector(begin).appendTo(new StringBuilder(200).append("StartTag at ")).append(" missing required end tag - invalid nested start tag encountered before end tag").toString());
565 // Terminate the element at the start of the invalidly nested start tag.
566 // This is how IE and Mozilla treat illegally nested A elements, but other elements may vary.
567 return new EndTag(source,nextStartTag.begin,nextStartTag.begin,EndTagType.NORMAL,name);
568 }
569 final Segment[] getResult=getEndTag(nextEndTag,checkForEmptyElementTag,Tag.isXMLName(name));
570 if (getResult!=null) return (EndTag)getResult[0];
571 }
572 if (source.logger.isInfoEnabled()) source.logger.info(source.getRowColumnVector(begin).appendTo(new StringBuilder(200).append("StartTag at ")).append(" missing required end tag").toString());
573 return null;
574 }
575
576 private EndTag getOptionalEndTag(final HTMLElementTerminatingTagNameSets terminatingTagNameSets) {
577 int pos=end;
578 while (pos<source.end) {
579 final Tag tag=Tag.getNextTag(source,pos);
580 if (tag==null) break;
581 Set terminatingTagNameSet;
582 if (tag instanceof EndTag) {
583 if (tag.name==name) return (EndTag)tag;
584 terminatingTagNameSet=terminatingTagNameSets.TerminatingEndTagNameSet;
585 } else {
586 terminatingTagNameSet=terminatingTagNameSets.NonterminatingElementNameSet;
587 if (terminatingTagNameSet!=null && terminatingTagNameSet.contains(tag.name)) {
588 Element nonterminatingElement=((StartTag)tag).getElement();
589 pos=nonterminatingElement.end;
590 continue;
591 }
592 terminatingTagNameSet=terminatingTagNameSets.TerminatingStartTagNameSet;
593 }
594 if (terminatingTagNameSet!=null && terminatingTagNameSet.contains(tag.name)) return new EndTag(source,tag.begin,tag.begin,EndTagType.NORMAL,name);
595 pos=tag.begin+1;
596 }
597 // Ran out of tags. The only legitimate case of this happening is if the HTML end tag is missing, in which case the end of the element is the end of the source document
598 return new EndTag(source,source.end,source.end,EndTagType.NORMAL,name);
599 }
600
601 static String getStartDelimiter(final String searchName) {
602 if (searchName.length()==0) throw new IllegalArgumentException("searchName argument must not be zero length");
603 final String startDelimiter=StartTagType.START_DELIMITER_PREFIX+searchName;
604 if (startDelimiter.charAt(StartTagType.START_DELIMITER_PREFIX.length())=='/') throw new IllegalArgumentException("searchName argument \""+searchName+"\" must not start with '/'");
605 return startDelimiter;
606 }
607
608 static StartTag getPrevious(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType) {
609 return getPrevious(source,pos,searchName,searchStartTagType,searchStartTagType==StartTagType.NORMAL ? Tag.isXMLName(searchName) : true);
610 }
611
612 static StartTag getPrevious(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType, final boolean isXMLTagName) {
613 // searchName is already in lower case, but may be null
614 // searchStartTagType must not be null
615 // isXMLTagName is only used if searchStartTagType==StartTagType.NORMAL
616 if (searchName==null) return (StartTag)source.getPreviousTag(pos,searchStartTagType);
617 final String startDelimiter=getStartDelimiter(searchName);
618 try {
619 final ParseText parseText=source.getParseText();
620 int begin=pos;
621 do {
622 begin=parseText.lastIndexOf(startDelimiter,begin);
623 if (begin==-1) return null;
624 final StartTag startTag=(StartTag)Tag.getTagAt(source,begin,false);
625 if (startTag==null) continue; // keep looking if it wasn't a start tag
626 if (searchStartTagType!=startTag.getStartTagType()) {
627 // The start tag is of the wrong type. The only case in which we want to return it is if
628 // we are looking for a normal start tag, the found start tag is unregistered, and the search name is NOT a valid XML name.
629 // This allows users to search for some types of unregistered tags by name rather than having to register custom tag types.
630 if (searchStartTagType!=StartTagType.NORMAL || isXMLTagName || !startTag.isUnregistered()) continue;
631 }
632 if (startTag.getStartTagType().isNameAfterPrefixRequired() && startTag.getName().length()>searchName.length()) {
633 // The name of the start tag is longer than the search name, and the type of tag indicates
634 // that we are probably looking for an exact match.
635 // (eg searchName="a", startTag.name="applet" -> reject)
636 // We only require an exact match if the last character of the search name is part of the name, as the
637 // search name might be just the prefix of a server tag.
638 // (eg searchName="?", startTag.name="?abc" -> accept, but searchName="?a", startTag.name="?abc" -> reject)
639 // The only exception to this is if the last character of the search name is a colon (which also forms part of
640 // the name), but signifies that we want to search on the entire namespace.
641 // (eg searchName="o:", startTag.name="o:p" -> accept)
642 char lastSearchNameChar=searchName.charAt(searchName.length()-1);
643 if (lastSearchNameChar!=':' && isXMLNameChar(lastSearchNameChar)) continue;
644 }
645 return startTag;
646 } while ((begin-=2)>=0);
647 } catch (IndexOutOfBoundsException ex) {
648 // this should never happen during a get previous operation so rethrow it:
649 throw ex;
650 }
651 return null;
652 }
653
654 static StartTag getNext(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType) {
655 return getNext(source,pos,searchName,searchStartTagType,searchStartTagType==StartTagType.NORMAL ? Tag.isXMLName(searchName) : true);
656 }
657
658 static StartTag getNext(final Source source, final int pos, final String searchName, final StartTagType searchStartTagType, final boolean isXMLTagName) {
659 // searchName is already in lower case, but may be null
660 // searchStartTagType must not be null
661 // isXMLTagName is only used if searchStartTagType==StartTagType.NORMAL
662 if (searchName==null) return (StartTag)source.getNextTag(pos,searchStartTagType);
663 final String startDelimiter=getStartDelimiter(searchName);
664 try {
665 final ParseText parseText=source.getParseText();
666 int begin=pos;
667 do {
668 begin=parseText.indexOf(startDelimiter,begin);
669 if (begin==-1) return null;
670 final StartTag startTag=(StartTag)Tag.getTagAt(source,begin,false);
671 if (startTag==null) continue; // keep looking if it wasn't a start tag
672 if (searchStartTagType!=startTag.getStartTagType()) {
673 // The start tag is of the wrong type. The only case in which we want to return it is if
674 // we are looking for a normal start tag, the found start tag is unregistered, and the search name is NOT a valid XML name.
675 // This allows users to search for some types of unregistered tags by name rather than having to register custom tag types.
676 if (searchStartTagType!=StartTagType.NORMAL || isXMLTagName || !startTag.isUnregistered()) continue;
677 }
678 if (startTag.getStartTagType().isNameAfterPrefixRequired() && startTag.getName().length()>searchName.length()) {
679 // The name of the start tag is longer than the search name, and the type of tag indicates
680 // that we are probably looking for an exact match.
681 // (eg searchName="a", startTag.name="applet" -> reject)
682 // We only require an exact match if the last character of the search name is part of the name, as the
683 // search name might be just the prefix of a server tag.
684 // (eg searchName="?", startTag.name="?abc" -> accept, but searchName="?a", startTag.name="?abc" -> reject)
685 // The only exception to this is if the last character of the search name is a colon (which also forms part of
686 // the name), but signifies that we want to search on the entire namespace.
687 // (eg searchName="o:", startTag.name="o:p" -> accept)
688 char lastSearchNameChar=searchName.charAt(searchName.length()-1);
689 if (lastSearchNameChar!=':' && isXMLNameChar(lastSearchNameChar)) continue;
690 }
691 return startTag;
692 } while ((begin+=1)<source.end);
693 } catch (IndexOutOfBoundsException ex) {
694 // this should only happen when the end of file is reached in the middle of a tag.
695 // we don't have to do anything to handle it as there are no more tags anyway.
696 }
697 return null;
698 }
699
700 static StartTag getPrevious(final Source source, int pos) {
701 Tag tag=Tag.getPreviousTag(source,pos);
702 if (tag==null) return null;
703 if (tag instanceof StartTag) return (StartTag)tag;
704 return tag.getPreviousStartTag();
705 }
706
707 static StartTag getNext(final Source source, int pos) {
708 Tag tag=Tag.getNextTag(source,pos);
709 if (tag==null) return null;
710 if (tag instanceof StartTag) return (StartTag)tag;
711 return tag.getNextStartTag();
712 }
713
714 static StartTag getNext(final Source source, final int pos, final String attributeName, final String value, final boolean valueCaseSensitive) {
715 if (value==null || attributeName.length()==0) throw new IllegalArgumentException();
716 // Determine whether to perform the text search on the name or value:
717 // - perform the text search on the value if it is >= 3 chars long.
718 // - have to perform the text search on the name if the value is zero length.
719 // - perform the text search on the name if the name >= 3 chars long, otherwise on the value.
720 final String searchString=value.length()>=3 || (value.length()>0 && attributeName.length()<3) ? value : attributeName;
721 final ParseText parseText=source.getParseText();
722 int searchPos=pos;
723 while (searchPos<source.end) {
724 searchPos=parseText.indexOf(searchString.toLowerCase(),searchPos);
725 if (searchPos==-1) return null;
726 final Tag tag=source.getEnclosingTag(searchPos);
727 if (tag==null || !(tag instanceof StartTag)) {
728 searchPos++;
729 continue;
730 }
731 if (tag.begin>=pos) {
732 final StartTag startTag=(StartTag)tag;
733 if (startTag.getAttributes()!=null) {
734 final String attributeValue=startTag.getAttributes().getValue(attributeName);
735 if (attributeValue!=null) {
736 if (value.equals(attributeValue)) return startTag;
737 if (value.equalsIgnoreCase(attributeValue)) {
738 if (!valueCaseSensitive) return startTag;
739 if (source.logger.isInfoEnabled()) source.logger.info(source.getRowColumnVector(searchPos).appendTo(new StringBuilder(200)).append(": StartTag with attribute ").append(attributeName).append("=\"").append(attributeValue).append("\" ignored during search because its case does not match search value \"").append(value).append('"').toString());
740 }
741 }
742 }
743 }
744 searchPos=tag.end;
745 }
746 return null;
747 }
748
749 static StartTag getNext(final Source source, final int pos, final String attributeName, final Pattern regexPattern) {
750 if (attributeName==null || attributeName.length()==0) throw new IllegalArgumentException();
751 final String searchString=attributeName;
752 final ParseText parseText=source.getParseText();
753 int searchPos=pos;
754 while (searchPos<source.end) {
755 searchPos=parseText.indexOf(searchString.toLowerCase(),searchPos);
756 if (searchPos==-1) return null;
757 final Tag tag=source.getEnclosingTag(searchPos);
758 if (tag==null || !(tag instanceof StartTag)) {
759 searchPos++;
760 continue;
761 }
762 if (tag.begin>=pos) {
763 final StartTag startTag=(StartTag)tag;
764 if (startTag.getAttributes()!=null) {
765 final Attribute attribute=startTag.getAttributes().get(attributeName);
766 if (attribute!=null) {
767 if (regexPattern==null) return startTag;
768 final String attributeValue=attribute.getValue();
769 if (attributeValue!=null && regexPattern.matcher(attributeValue).matches()) return startTag;
770 }
771 }
772 }
773 searchPos=tag.end;
774 }
775 return null;
776 }
777
778 private Segment[] getEndTag(final EndTag nextEndTag, final boolean checkForEmptyElementTag, final boolean isXMLTagName) {
779 assert nextEndTag!=null;
780 StartTag nextStartTag=getNext(source,end,name,startTagType,isXMLTagName);
781 if (checkForEmptyElementTag) {
782 while (nextStartTag!=null && nextStartTag.isSyntacticalEmptyElementTag())
783 nextStartTag=getNext(source,nextStartTag.end,name,startTagType,isXMLTagName);
784 }
785 return getEndTag(end,nextStartTag,nextEndTag,checkForEmptyElementTag,isXMLTagName);
786 }
787
788 private Segment[] getEndTag(final int afterPos, final StartTag nextStartTag, final EndTag nextEndTag, final boolean checkForEmptyElementTag, final boolean isXMLTagName) {
789 // returns null if no end tag exists in the rest of the file, otherwise the following two segments:
790 // first is the matching end tag to this start tag. Must be present if array is returned.
791 // second is the next occurrence after the returned end tag of a start tag of the same name. (null if none exists)
792 if (nextEndTag==null) return null; // no end tag in the rest of the file
793 final Segment[] returnArray={nextEndTag,nextStartTag};
794 if (nextStartTag==null || nextStartTag.begin>nextEndTag.begin) return returnArray; // no more start tags of the same name in rest of file, or they occur after the end tag that we found. This means we have found the matching end tag.
795 final Segment[] getResult=nextStartTag.getEndTag(nextEndTag,checkForEmptyElementTag,isXMLTagName); // get the matching end tag to the interloping start tag
796 if (getResult==null) return null; // no end tag in the rest of the file
797 final EndTag nextStartTagsEndTag=(EndTag)getResult[0];
798 final EndTag nextNextEndTag=EndTag.getNext(source,nextStartTagsEndTag.end,nextEndTag.getName(),nextEndTag.getEndTagType()); // get end tag after the interloping start tag's end tag
799 return getEndTag(nextStartTagsEndTag.end,(StartTag)getResult[1],nextNextEndTag,checkForEmptyElementTag,isXMLTagName); // recurse to see if this is the matching end tag
800 }
801 }

   
Visit the aagtl Website