1 |
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
|
2 |
// Version 3.2
|
3 |
// Copyright (C) 2004-2009 Martin Jericho
|
4 |
// http://jericho.htmlparser.net/
|
5 |
//
|
6 |
// This library is free software; you can redistribute it and/or
|
7 |
// modify it under the terms of either one of the following licences:
|
8 |
//
|
9 |
// 1. The Eclipse Public License (EPL) version 1.0,
|
10 |
// included in this distribution in the file licence-epl-1.0.html
|
11 |
// or available at http://www.eclipse.org/legal/epl-v10.html
|
12 |
//
|
13 |
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
|
14 |
// included in this distribution in the file licence-lgpl-2.1.txt
|
15 |
// or available at http://www.gnu.org/licenses/lgpl.txt
|
16 |
//
|
17 |
// This library is distributed on an "AS IS" basis,
|
18 |
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 |
// See the individual licence texts for more details.
|
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
|
25 |
/**
|
26 |
* Represents an <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.1">element</a>
|
27 |
* in a specific {@linkplain Source source} document, which encompasses a {@linkplain #getStartTag() start tag},
|
28 |
* an optional {@linkplain #getEndTag() end tag} and all {@linkplain #getContent() content} in between.
|
29 |
* <p>
|
30 |
* Take the following HTML segment as an example:
|
31 |
* <p>
|
32 |
* <code>&lt;p&gt;This is a sample paragraph.&lt;/p&gt;</code>
|
33 |
* <p>
|
34 |
* The whole segment is represented by an <code>Element</code> object. This is comprised of the {@link StartTag} "<code>&lt;p&gt;</code>",
|
35 |
* the {@link EndTag} "<code>&lt;/p&gt;</code>", as well as the text in between.
|
36 |
* An element may also contain other elements between its start and end tags.
|
37 |
* <p>
|
38 |
* The term <i><a name="Normal">normal element</a></i> refers to an element having a {@linkplain #getStartTag() start tag}
|
39 |
* with a {@linkplain StartTag#getStartTagType() type} of {@link StartTagType#NORMAL}.
|
40 |
* This comprises all {@linkplain HTMLElements HTML elements} and <a href="HTMLElements.html#NonHTMLElement">non-HTML elements</a>.
|
41 |
* <p>
|
42 |
* <code>Element</code> instances are obtained using one of the following methods:
|
43 |
* <ul>
|
44 |
* <li>{@link StartTag#getElement()}
|
45 |
* <li>{@link EndTag#getElement()}
|
46 |
* <li>{@link Segment#getAllElements()}
|
47 |
* <li>{@link Segment#getAllElements(String name)}
|
48 |
* <li>{@link Segment#getAllElements(StartTagType)}
|
49 |
* </ul>
|
50 |
* See also the {@link HTMLElements} class, and the
|
51 |
* <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-element">XML 1.0 specification for elements</a>.
|
52 |
* <h3><a name="Structure">Element Structure</a></h3>
|
53 |
* <p>
|
54 |
* The three possible structures of an element are listed below:
|
55 |
* <dl class="Separated">
|
56 |
* <dt><a name="SingleTag">Single Tag Element</a>:
|
57 |
* <dd>
|
58 |
* Example:<br />
|
59 |
* <code>&lt;img src="mypicture.jpg"&gt;</code>
|
60 |
* <p>
|
61 |
* The element consists only of a single {@linkplain #getStartTag() start tag} and has no {@linkplain #getContent() element content}
|
62 |
* (although the start tag itself may have {@linkplain StartTag#getTagContent() tag content}).
|
63 |
* <br />{@link #getEndTag()}<code>==null</code>
|
64 |
* <br />{@link #isEmpty()}<code>==true</code>
|
65 |
* <br />{@link #getEnd() getEnd()}<code>==</code>{@link #getStartTag()}<code>.</code>{@link #getEnd() getEnd()}
|
66 |
* <p>
|
67 |
* This occurs in the following situations:
|
68 |
* <ul class="Unseparated">
|
69 |
* <li>An <a href="HTMLElements.html#HTMLElement">HTML element</a> for which the {@linkplain HTMLElements#getEndTagForbiddenElementNames() end tag is forbidden}.
|
70 |
* <li>An <a href="HTMLElements.html#HTMLElement">HTML element</a> for which the {@linkplain HTMLElements#getEndTagRequiredElementNames() end tag is required},
|
71 |
* but the end tag is not present in the source document.
|
72 |
* <li>An <a href="HTMLElements.html#HTMLElement">HTML element</a> for which the {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional},
|
73 |
* where the <a href="#ImplicitlyTerminated">implicitly terminating</a> tag is situated immediately after the element's
|
74 |
* {@linkplain #getStartTag() start tag}.
|
75 |
* <li>An {@linkplain #isEmptyElementTag() empty element tag}
|
76 |
* <li>A <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a> that is not an {@linkplain #isEmptyElementTag() empty element tag} but is missing its end tag.
|
77 |
* <li>An element with a start tag of a {@linkplain StartTag#getStartTagType() type} that does not define a
|
78 |
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}.
|
79 |
* <li>An element with a start tag of a {@linkplain StartTag#getStartTagType() type} that does define a
|
80 |
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type} but is missing its end tag.
|
81 |
* </ul>
|
82 |
* <dt><a name="ExplicitlyTerminated">Explicitly Terminated Element</a>:
|
83 |
* <dd>
|
84 |
* Example:<br />
|
85 |
* <code>&lt;p&gt;This is a sample paragraph.&lt;/p&gt;</code>
|
86 |
* <p>
|
87 |
* The element consists of a {@linkplain #getStartTag() start tag}, {@linkplain #getContent() content},
|
88 |
* and an {@linkplain #getEndTag() end tag}.
|
89 |
* <br />{@link #getEndTag()}<code>!=null</code>.
|
90 |
* <br />{@link #isEmpty()}<code>==false</code> (provided the end tag doesn't immediately follow the start tag)
|
91 |
* <br />{@link #getEnd() getEnd()}<code>==</code>{@link #getEndTag()}<code>.</code>{@link #getEnd() getEnd()}.
|
92 |
* <p>
|
93 |
* This occurs in the following situations, assuming the start tag's matching end tag is present in the source document:
|
94 |
* <ul class="Unseparated">
|
95 |
* <li>An <a href="HTMLElements.html#HTMLElement">HTML element</a> for which the end tag is either
|
96 |
* {@linkplain HTMLElements#getEndTagRequiredElementNames() required} or {@linkplain HTMLElements#getEndTagOptionalElementNames() optional}.
|
97 |
* <li>A <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a> that is not an {@linkplain #isEmptyElementTag() empty element tag}.
|
98 |
* <li>An element with a start tag of a {@linkplain StartTag#getStartTagType() type} that defines a
|
99 |
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}.
|
100 |
* </ul>
|
101 |
* <dt><a name="ImplicitlyTerminated">Implicitly Terminated Element</a>:
|
102 |
* <dd>
|
103 |
* Example:<br />
|
104 |
* <code>&lt;p&gt;This text is included in the paragraph element even though no end tag is present.</code><br />
|
105 |
* <code>&lt;p&gt;This is the next paragraph.</code>
|
106 |
* <p>
|
107 |
* The element consists of a {@linkplain #getStartTag() start tag} and {@linkplain #getContent() content},
|
108 |
* but no {@linkplain #getEndTag() end tag}.
|
109 |
* <br />{@link #getEndTag()}<code>==null</code>.
|
110 |
* <br />{@link #isEmpty()}<code>==false</code>
|
111 |
* <br />{@link #getEnd() getEnd()}<code>!=</code>{@link #getStartTag()}<code>.</code>{@link #getEnd() getEnd()}.
|
112 |
* <p>
|
113 |
* This only occurs in an <a href="HTMLElements.html#HTMLElement">HTML element</a> for which the
|
114 |
* {@linkplain HTMLElements#getEndTagOptionalElementNames() end tag is optional}.
|
115 |
* <p>
|
116 |
* The element ends at the start of a tag which implies the termination of the element, called the <i>implicitly terminating tag</i>.
|
117 |
* If the implicitly terminating tag is situated immediately after the element's {@linkplain #getStartTag() start tag},
|
118 |
* the element is classed as a <a href="#SingleTag">single tag element</a>.
|
119 |
* <p>
|
120 |
* See the <a href="Element.html#ParsingRulesHTMLEndTagOptional">element parsing rules for HTML elements with optional end tags</a>
|
121 |
* for details on which tags can implicitly terminate a given element.
|
122 |
* <p>
|
123 |
* See also the documentation of the {@link HTMLElements#getEndTagOptionalElementNames()} method.
|
124 |
* </dl>
|
125 |
* <h3><a name="ParsingRules">Element Parsing Rules</a></h3>
|
126 |
* The following rules describe the algorithm used in the {@link StartTag#getElement()} method to construct an element.
|
127 |
* The detection of the start tag's matching end tag or other terminating tags always takes into account the possible nesting of elements.
|
128 |
* <p>
|
129 |
* <ul class="Separated">
|
130 |
* <li>
|
131 |
* If the start tag has a {@linkplain StartTag#getStartTagType() type} of {@link StartTagType#NORMAL}:
|
132 |
* <ul>
|
133 |
* <li>
|
134 |
* If the {@linkplain StartTag#getName() name} of the start tag matches one of the
|
135 |
* recognised {@linkplain HTMLElementName HTML element names} (indicating an <a href="HTMLElements.html#HTMLElement">HTML element</a>):
|
136 |
* <ul>
|
137 |
* <li>
|
138 |
* <a name="ParsingRulesHTMLEndTagForbidden"></a>
|
139 |
* If the end tag for an element of this {@linkplain StartTag#getName() name} is
|
140 |
* {@linkplain HTMLElements#getEndTagForbiddenElementNames() forbidden},
|
141 |
* the parser does not conduct any search for an end tag and a <a href="#SingleTag">single tag element</a> is created.
|
142 |
* <li>
|
143 |
* <a name="ParsingRulesHTMLEndTagRequired"></a>
|
144 |
* If the end tag for an element of this {@linkplain StartTag#getName() name} is
|
145 |
* {@linkplain HTMLElements#getEndTagRequiredElementNames() required}, the parser searches for the start tag's matching end tag.
|
146 |
* <ul class="Unseparated">
|
147 |
* <li>
|
148 |
* If the matching end tag is found, an <a href="#ExplicitlyTerminated">explicitly terminated element</a> is created.
|
149 |
* <li>
|
150 |
* If no matching end tag is found, the source document is not valid HTML and the incident is
|
151 |
* {@linkplain Source#getLogger() logged} as a missing required end tag.
|
152 |
* In this situation a <a href="#SingleTag">single tag element</a> is created.
|
153 |
* </ul>
|
154 |
* <li>
|
155 |
* <a name="ParsingRulesHTMLEndTagOptional"></a>
|
156 |
* If the end tag for an element of this {@linkplain StartTag#getName() name} is
|
157 |
* {@linkplain HTMLElements#getEndTagOptionalElementNames() optional}, the parser searches not only for the start tag's matching end tag,
|
158 |
* but also for any other tag that <a href="#ImplicitlyTerminated">implicitly terminates</a> the element.
|
159 |
* <br />For each tag (<i>T2</i>) following the start tag (<i>ST1</i>) of this element (<i>E1</i>):
|
160 |
* <ul class="Unseparated">
|
161 |
* <li>
|
162 |
* If <i>T2</i> is a start tag:
|
163 |
* <ul>
|
164 |
* <li>
|
165 |
* If the {@linkplain StartTag#getName() name} of <i>T2</i> is in the list of
|
166 |
* {@linkplain HTMLElements#getNonterminatingElementNames(String) non-terminating element names} for <i>E1</i>,
|
167 |
* then continue evaluating tags from the {@linkplain Element#getEnd() end} of <i>T2</i>'s corresponding
|
168 |
* {@linkplain StartTag#getElement() element}.
|
169 |
* <li>
|
170 |
* If the {@linkplain StartTag#getName() name} of <i>T2</i> is in the list of
|
171 |
* {@linkplain HTMLElements#getTerminatingStartTagNames(String) terminating start tag names} for <i>E1</i>,
|
172 |
* then <i>E1</i> ends at the {@linkplain StartTag#getBegin() beginning} of <i>T2</i>.
|
173 |
* If <i>T2</i> follows immediately after <i>ST1</i>, a <a href="#SingleTag">single tag element</a> is created,
|
174 |
* otherwise an <a href="#ImplicitlyTerminated">implicitly terminated element</a> is created.
|
175 |
* </ul>
|
176 |
* <li>
|
177 |
* If <i>T2</i> is an end tag:
|
178 |
* <ul>
|
179 |
* <li>
|
180 |
* If the {@linkplain EndTag#getName() name} of <i>T2</i> is the same as that of <i>ST1</i>,
|
181 |
* an <a href="#ExplicitlyTerminated">explicitly terminated element</a> is created.
|
182 |
* <li>
|
183 |
* If the {@linkplain EndTag#getName() name} of <i>T2</i> is in the list of
|
184 |
* {@linkplain HTMLElements#getTerminatingEndTagNames(String) terminating end tag names} for <i>E1</i>,
|
185 |
* then <i>E1</i> ends at the {@linkplain EndTag#getBegin() beginning} of <i>T2</i>.
|
186 |
* If <i>T2</i> follows immediately after <i>ST1</i>, a <a href="#SingleTag">single tag element</a> is created,
|
187 |
* otherwise an <a href="#ImplicitlyTerminated">implicitly terminated element</a> is created.
|
188 |
* </ul>
|
189 |
* <li>
|
190 |
* If no more tags are present in the source document, then <i>E1</i> ends at the end of the file, and an
|
191 |
* <a href="#ImplicitlyTerminated">implicitly terminated element</a> is created.
|
192 |
* </ul>
|
193 |
* </ul>
|
194 |
* Note that the syntactical indication of an {@linkplain StartTag#isSyntacticalEmptyElementTag() empty-element tag} in the start tag
|
195 |
* is ignored when determining the end of <a href="HTMLElements.html#HTMLElement">HTML elements</a>.
|
196 |
* See the documentation of the {@link #isEmptyElementTag()} method for more information.
|
197 |
* <li>
|
198 |
* If the {@linkplain StartTag#getName() name} of the start tag does not match one of the
|
199 |
* recognised {@linkplain HTMLElementName HTML element names} (indicating a <a href="HTMLElements.html#NonHTMLElement">non-HTML element</a>):
|
200 |
* <ul>
|
201 |
* <li>
|
202 |
* If the start tag is {@linkplain StartTag#isSyntacticalEmptyElementTag() syntactically an empty-element tag},
|
203 |
* the parser does not conduct any search for an end tag and a <a href="#SingleTag">single tag element</a> is created.
|
204 |
* <li>
|
205 |
* Otherwise, section <a target="_blank" href="http://www.w3.org/TR/REC-xml#CleanAttrVals">3.1</a>
|
206 |
* of the XML 1.0 specification states that a matching end tag MUST be present, and
|
207 |
* the parser searches for the start tag's matching end tag.
|
208 |
* <ul class="Unseparated">
|
209 |
* <li>
|
210 |
* If the matching end tag is found, an <a href="#ExplicitlyTerminated">explicitly terminated element</a> is created.
|
211 |
* <li>
|
212 |
* If no matching end tag is found, the source document is not valid XML and the incident is
|
213 |
* {@linkplain Source#getLogger() logged} as a missing required end tag.
|
214 |
* In this situation a <a href="#SingleTag">single tag element</a> is created.
|
215 |
* </ul>
|
216 |
* </ul>
|
217 |
* </ul>
|
218 |
* <li>
|
219 |
* If the start tag has any {@linkplain StartTag#getStartTagType() type} other than {@link StartTagType#NORMAL}:
|
220 |
* <ul>
|
221 |
* <li>
|
222 |
* If the start tag's type does not define a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type},
|
223 |
* the parser does not conduct any search for an end tag and a <a href="#SingleTag">single tag element</a> is created.
|
224 |
* <li>
|
225 |
* If the start tag's type does define a {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type},
|
226 |
* the parser assumes that a matching end tag is required and searches for it.
|
227 |
* <ul class="Unseparated">
|
228 |
* <li>
|
229 |
* If the matching end tag is found, an <a href="#ExplicitlyTerminated">explicitly terminated element</a> is created.
|
230 |
* <li>
|
231 |
* If no matching end tag is found, the missing required end tag is {@linkplain Source#getLogger() logged}
|
232 |
* and a <a href="#SingleTag">single tag element</a> is created.
|
233 |
* </ul>
|
234 |
* </ul>
|
235 |
* </ul>
|
236 |
* @see HTMLElements
|
237 |
*/
|
238 |
public final class Element extends Segment {
|
239 |
// Start tag that opens this element; null only for the NOT_CACHED sentinel (see the private constructor).
private final StartTag startTag;
|
240 |
// End tag that closes this element, or null if there is none (a zero-length end tag is stored as null by the constructor).
private final EndTag endTag;
|
241 |
// Content segment, created lazily by getContent(); null until first requested.
private Segment content=null;
|
242 |
// Parent in the element hierarchy. NOT_CACHED means "not determined yet"; null means "no parent".
// Package-visible and non-final because it is assigned while the hierarchy is being built.
Element parentElement=Element.NOT_CACHED;
|
243 |
// Nesting depth in the element hierarchy; -1 means "not determined yet".
private int depth=-1;
|
244 |
// Immediate child elements, built lazily by getChildElements(int); null until first requested.
private List<Element> childElements=null;
|
245 |
|
246 |
// Sentinel instance used as the initial value of parentElement to distinguish "unknown" from "no parent".
static final Element NOT_CACHED=new Element();
|
247 |
|
248 |
// When false, getChildElements(int) skips children whose end lies beyond the end of their parent.
private static final boolean INCLUDE_INCORRECTLY_NESTED_CHILDREN_IN_HIERARCHY=true;
|
249 |
|
250 |
Element(final Source source, final StartTag startTag, final EndTag endTag) {
|
251 |
super(source, startTag.begin, endTag==null ? startTag.end : endTag.end);
|
252 |
if (source.isStreamed()) throw new UnsupportedOperationException("Elements are not supported when using StreamedSource");
|
253 |
this.startTag=startTag;
|
254 |
this.endTag=(endTag==null || endTag.length()==0) ? null : endTag;
|
255 |
}
|
256 |
|
257 |
// Creates the NOT_CACHED sentinel instance, which has neither a start tag nor an end tag.
// Not intended for any other use.
private Element() {
	super();
	this.startTag=null;
	this.endTag=null;
}
|
262 |
|
263 |
/**
|
264 |
* Returns the parent of this element in the document element hierarchy.
|
265 |
* <p>
|
266 |
* The {@link Source#fullSequentialParse()} method must be called (either explicitly or implicitly) immediately after construction of the <code>Source</code> object if this method is to be used.
|
267 |
* An <code>IllegalStateException</code> is thrown if a full sequential parse has not been performed or if it was performed after this element was found.
|
268 |
* <p>
|
269 |
* This method returns <code>null</code> for a <a href="Source.html#TopLevelElement">top-level element</a>,
|
270 |
* as well as any element formed from a {@linkplain TagType#isServerTag() server tag}, regardless of whether it is nested inside a normal element.
|
271 |
* <p>
|
272 |
* See the {@link Source#getChildElements()} method for more details.
|
273 |
*
|
274 |
* @return the parent of this element in the document element hierarchy, or <code>null</code> if this element is a <a href="Source.html#TopLevelElement">top-level element</a>.
|
275 |
* @throws IllegalStateException if a {@linkplain Source#fullSequentialParse() full sequential parse} has not been performed or if it was performed after this element was found.
|
276 |
* @see #getChildElements()
|
277 |
*/
|
278 |
public Element getParentElement() {
|
279 |
if (parentElement==Element.NOT_CACHED) {
|
280 |
if (!source.wasFullSequentialParseCalled()) throw new IllegalStateException("This operation is only possible after a full sequential parse has been performed");
|
281 |
if (startTag.isOrphaned()) throw new IllegalStateException("This operation is only possible if a full sequential parse was performed immediately after construction of the Source object");
|
282 |
source.getChildElements();
|
283 |
if (parentElement==Element.NOT_CACHED) parentElement=null;
|
284 |
}
|
285 |
return parentElement;
|
286 |
}
|
287 |
|
288 |
/**
|
289 |
* Returns a list of the immediate children of this element in the document element hierarchy.
|
290 |
* <p>
|
291 |
* The objects in the list are all of type {@link Element}.
|
292 |
* <p>
|
293 |
* See the {@link Source#getChildElements()} method for more details.
|
294 |
*
|
295 |
* @return a list of the immediate children of this element in the document element hierarchy, guaranteed not <code>null</code>.
|
296 |
* @see #getParentElement()
|
297 |
*/
|
298 |
@Override public final List<Element> getChildElements() {
|
299 |
return childElements!=null ? childElements : getChildElements(-1);
|
300 |
}
|
301 |
|
302 |
final List<Element> getChildElements(int depth) {
|
303 |
if (depth!=-1) this.depth=depth;
|
304 |
if (childElements==null) {
|
305 |
if (!Config.IncludeServerTagsInElementHierarchy && end==startTag.end) {
|
306 |
childElements=Collections.emptyList();
|
307 |
} else {
|
308 |
final int childDepth=(depth==-1 ? -1 : depth+1);
|
309 |
childElements=new ArrayList<Element>();
|
310 |
int pos=Config.IncludeServerTagsInElementHierarchy ? begin+1 : startTag.end;
|
311 |
final int maxChildBegin=(Config.IncludeServerTagsInElementHierarchy || endTag==null) ? end : endTag.begin;
|
312 |
while (true) {
|
313 |
final StartTag childStartTag=source.getNextStartTag(pos);
|
314 |
if (childStartTag==null || childStartTag.begin>=maxChildBegin) break;
|
315 |
if (Config.IncludeServerTagsInElementHierarchy) {
|
316 |
if (childStartTag.begin<startTag.end && !childStartTag.getTagType().isServerTag() && !startTag.getTagType().isServerTag()) {
|
317 |
// A start tag is found within another start tag, but neither is a server tag.
|
318 |
// This only legitimately happens in very rare cases like entity definitions in doctype.
|
319 |
// We don't want to include the child elements in the hierarchy.
|
320 |
pos=childStartTag.end;
|
321 |
continue;
|
322 |
}
|
323 |
} else if (childStartTag.getTagType().isServerTag()) {
|
324 |
pos=childStartTag.end;
|
325 |
continue;
|
326 |
}
|
327 |
final Element childElement=childStartTag.getElement();
|
328 |
if (childElement.end>end) {
|
329 |
if (source.logger.isInfoEnabled()) source.logger.info("Child "+childElement.getDebugInfo()+" extends beyond end of parent "+getDebugInfo());
|
330 |
if (!INCLUDE_INCORRECTLY_NESTED_CHILDREN_IN_HIERARCHY) {
|
331 |
pos=childElement.end;
|
332 |
continue;
|
333 |
}
|
334 |
}
|
335 |
childElement.getChildElements(childDepth);
|
336 |
if (childElement.parentElement==Element.NOT_CACHED) { // make sure element was not added as a child of a descendent element (can happen with overlapping elements)
|
337 |
childElement.parentElement=this;
|
338 |
childElements.add(childElement);
|
339 |
}
|
340 |
pos=childElement.end;
|
341 |
}
|
342 |
}
|
343 |
}
|
344 |
return childElements;
|
345 |
}
|
346 |
|
347 |
/**
|
348 |
* Returns the nesting depth of this element in the document element hierarchy.
|
349 |
* <p>
|
350 |
* The {@link Source#fullSequentialParse()} method must be called (either explicitly or implicitly) after construction of the <code>Source</code> object if this method is to be used.
|
351 |
* An <code>IllegalStateException</code> is thrown if a full sequential parse has not been performed or if it was performed after this element was found.
|
352 |
* <p>
|
353 |
* A <a href="Source.html#TopLevelElement">top-level element</a> has a nesting depth of <code>0</code>.
|
354 |
* <p>
|
355 |
* An element formed from a {@linkplain TagType#isServerTag() server tag} always have a nesting depth of <code>0</code>,
|
356 |
* regardless of whether it is nested inside a normal element.
|
357 |
* <p>
|
358 |
* See the {@link Source#getChildElements()} method for more details.
|
359 |
*
|
360 |
* @return the nesting depth of this element in the document element hierarchy.
|
361 |
* @throws IllegalStateException if a {@linkplain Source#fullSequentialParse() full sequential parse} has not been performed or if it was performed after this element was found.
|
362 |
* @see #getParentElement()
|
363 |
*/
|
364 |
public int getDepth() {
|
365 |
if (depth==-1) {
|
366 |
getParentElement();
|
367 |
if (depth==-1) depth=0;
|
368 |
}
|
369 |
return depth;
|
370 |
}
|
371 |
|
372 |
/**
|
373 |
* Returns the segment representing the <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-content">content</a> of the element.
|
374 |
* <p>
|
375 |
* This segment spans between the end of the start tag and the start of the end tag.
|
376 |
* If the end tag is not present, the content reaches to the end of the element.
|
377 |
* <p>
|
378 |
* A zero-length segment is returned if the element is {@linkplain #isEmpty() empty},
|
379 |
*
|
380 |
* @return the segment representing the content of the element, guaranteed not <code>null</code>.
|
381 |
*/
|
382 |
public Segment getContent() {
|
383 |
if (content==null) content=new Segment(source,startTag.end,getContentEnd());
|
384 |
return content;
|
385 |
}
|
386 |
|
387 |
/**
|
388 |
* Returns the start tag of the element.
|
389 |
* @return the start tag of the element.
|
390 |
*/
|
391 |
public StartTag getStartTag() {
|
392 |
return startTag;
|
393 |
}
|
394 |
|
395 |
/**
|
396 |
* Returns the end tag of the element.
|
397 |
* <p>
|
398 |
* If the element has no end tag this method returns <code>null</code>.
|
399 |
*
|
400 |
* @return the end tag of the element, or <code>null</code> if the element has no end tag.
|
401 |
*/
|
402 |
public EndTag getEndTag() {
|
403 |
return endTag;
|
404 |
}
|
405 |
|
406 |
/**
|
407 |
* Returns the {@linkplain StartTag#getName() name} of the {@linkplain #getStartTag() start tag} of this element, always in lower case.
|
408 |
* <p>
|
409 |
* This is equivalent to {@link #getStartTag()}<code>.</code>{@link StartTag#getName() getName()}.
|
410 |
* <p>
|
411 |
* See the {@link Tag#getName()} method for more information.
|
412 |
*
|
413 |
* @return the name of the {@linkplain #getStartTag() start tag} of this element, always in lower case.
|
414 |
*/
|
415 |
public String getName() {
|
416 |
return startTag.getName();
|
417 |
}
|
418 |
|
419 |
/**
|
420 |
* Indicates whether this element has zero-length {@linkplain #getContent() content}.
|
421 |
* <p>
|
422 |
* This is equivalent to {@link #getContent()}<code>.</code>{@link Segment#length() length()}<code>==0</code>.
|
423 |
* <p>
|
424 |
* Note that this is a broader definition than that of both the
|
425 |
* <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-4">HTML definition of an empty element</a>,
|
426 |
* which is only those elements whose end tag is {@linkplain HTMLElements#getEndTagForbiddenElementNames() forbidden}, and the
|
427 |
* <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-empty">XML definition of an empty element</a>,
|
428 |
* which is "either a start-tag immediately followed by an end-tag, or an {@linkplain #isEmptyElementTag() empty-element tag}".
|
429 |
* The other possibility covered by this property is the case of an <a href="HTMLElements.html#HTMLElement">HTML element</a> with an
|
430 |
* {@linkplain HTMLElements#getEndTagOptionalElementNames() optional} end tag that is immediately followed by another tag that implicitly
|
431 |
* terminates the element.
|
432 |
*
|
433 |
* @return <code>true</code> if this element has zero-length {@linkplain #getContent() content}, otherwise <code>false</code>.
|
434 |
* @see #isEmptyElementTag()
|
435 |
*/
|
436 |
public boolean isEmpty() {
|
437 |
return startTag.end==getContentEnd();
|
438 |
}
|
439 |
|
440 |
/**
|
441 |
* Indicates whether this element is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>.
|
442 |
* <p>
|
443 |
* This is equivalent to {@link #getStartTag()}<code>.</code>{@link StartTag#isEmptyElementTag() isEmptyElementTag()}.
|
444 |
*
|
445 |
* @return <code>true</code> if this element is an <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-eetag">empty-element tag</a>, otherwise <code>false</code>.
|
446 |
*/
|
447 |
public boolean isEmptyElementTag() {
|
448 |
return startTag.isEmptyElementTag();
|
449 |
}
|
450 |
|
451 |
/**
|
452 |
* Returns the attributes specified in this element's start tag.
|
453 |
* <p>
|
454 |
* This is equivalent to {@link #getStartTag()}<code>.</code>{@link StartTag#getAttributes() getAttributes()}.
|
455 |
*
|
456 |
* @return the attributes specified in this element's start tag.
|
457 |
* @see StartTag#getAttributes()
|
458 |
*/
|
459 |
public Attributes getAttributes() {
|
460 |
return getStartTag().getAttributes();
|
461 |
}
|
462 |
|
463 |
/**
|
464 |
* Returns the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name (case insensitive).
|
465 |
* <p>
|
466 |
* Returns <code>null</code> if the {@linkplain #getStartTag() start tag of this element} does not
|
467 |
* {@linkplain StartTagType#hasAttributes() have attributes},
|
468 |
* no attribute with the specified name exists or the attribute {@linkplain Attribute#hasValue() has no value}.
|
469 |
* <p>
|
470 |
* This is equivalent to {@link #getStartTag()}<code>.</code>{@link StartTag#getAttributeValue(String) getAttributeValue(attributeName)}.
|
471 |
*
|
472 |
* @param attributeName the name of the attribute to get.
|
473 |
* @return the {@linkplain CharacterReference#decode(CharSequence) decoded} value of the attribute with the specified name, or <code>null</code> if the attribute does not exist or {@linkplain Attribute#hasValue() has no value}.
|
474 |
*/
|
475 |
public String getAttributeValue(final String attributeName) {
|
476 |
return getStartTag().getAttributeValue(attributeName);
|
477 |
}
|
478 |
|
479 |
/**
|
480 |
* Returns the {@link FormControl} defined by this element.
|
481 |
* @return the {@link FormControl} defined by this element, or <code>null</code> if it is not a <a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#form-controls">control</a>.
|
482 |
*/
|
483 |
public FormControl getFormControl() {
|
484 |
return FormControl.construct(this);
|
485 |
}
|
486 |
|
487 |
public String getDebugInfo() {
|
488 |
if (this==NOT_CACHED) return "NOT_CACHED";
|
489 |
final StringBuilder sb=new StringBuilder();
|
490 |
sb.append("Element ");
|
491 |
startTag.appendDebugTag(sb);
|
492 |
if (!isEmpty()) sb.append('-');
|
493 |
if (endTag!=null) sb.append(endTag);
|
494 |
sb.append(' ');
|
495 |
startTag.appendDebugTagType(sb);
|
496 |
sb.append(super.getDebugInfo());
|
497 |
return sb.toString();
|
498 |
}
|
499 |
|
500 |
int getContentEnd() {
|
501 |
return endTag!=null ? endTag.begin : end;
|
502 |
}
|
503 |
}
|