// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 3.2
// Copyright (C) 2004-2009 Martin Jericho
// http://jericho.htmlparser.net/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of either one of the following licences:
//
// 1. The Eclipse Public License (EPL) version 1.0,
// included in this distribution in the file licence-epl-1.0.html
// or available at http://www.eclipse.org/legal/epl-v10.html
//
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
// included in this distribution in the file licence-lgpl-2.1.txt
// or available at http://www.gnu.org/licenses/lgpl.txt
//
// This library is distributed on an "AS IS" basis,
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
// See the individual licence texts for more details.
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
|
25 |
/**
|
26 |
* Represents the <a target="_blank" href="http://www.w3.org/TR/html401/intro/sgmltut.html#didx-element-3">end tag</a> of an
|
27 |
* {@linkplain Element element} in a specific {@linkplain Source source} document.
|
28 |
* <p>
|
29 |
* An end tag always has a {@linkplain #getTagType() type} that is a subclass of {@link EndTagType}, meaning it
|
30 |
* always starts with the characters '<code></</code>'.
|
31 |
* <p>
|
32 |
* <code>EndTag</code> instances are obtained using one of the following methods:
|
33 |
* <ul>
|
34 |
* <li>{@link Element#getEndTag()}
|
35 |
* <li>{@link Tag#getNextTag()}
|
36 |
* <li>{@link Tag#getPreviousTag()}
|
37 |
* <li>{@link Source#getPreviousEndTag(int pos)}
|
38 |
* <li>{@link Source#getPreviousEndTag(int pos, String name)}
|
39 |
* <li>{@link Source#getPreviousTag(int pos)}
|
40 |
* <li>{@link Source#getPreviousTag(int pos, TagType)}
|
41 |
* <li>{@link Source#getNextEndTag(int pos)}
|
42 |
* <li>{@link Source#getNextEndTag(int pos, String name)}
|
43 |
* <li>{@link Source#getNextEndTag(int pos, String name, EndTagType)}
|
44 |
* <li>{@link Source#getNextTag(int pos)}
|
45 |
* <li>{@link Source#getNextTag(int pos, TagType)}
|
46 |
* <li>{@link Source#getEnclosingTag(int pos)}
|
47 |
* <li>{@link Source#getEnclosingTag(int pos, TagType)}
|
48 |
* <li>{@link Source#getTagAt(int pos)}
|
49 |
* <li>{@link Segment#getAllTags()}
|
50 |
* <li>{@link Segment#getAllTags(TagType)}
|
51 |
* </ul>
|
52 |
* <p>
|
53 |
* The {@link Tag} superclass defines the {@link Tag#getName() getName()} method used to get the name of this end tag.
|
54 |
* <p>
|
55 |
* See also the XML 1.0 specification for <a target="_blank" href="http://www.w3.org/TR/REC-xml#dt-etag">end tags</a>.
|
56 |
*
|
57 |
* @see Tag
|
58 |
* @see StartTag
|
59 |
* @see Element
|
60 |
*/
|
61 |
public final class EndTag extends Tag {
|
62 |
private final EndTagType endTagType;
|
63 |
|
64 |
/**
|
65 |
* Constructs a new <code>EndTag</code>.
|
66 |
*
|
67 |
* @param source the {@link Source} document.
|
68 |
* @param begin the character position in the source document where this tag {@linkplain Segment#getBegin() begins}.
|
69 |
* @param end the character position in the source document where this tag {@linkplain Segment#getEnd() ends}.
|
70 |
* @param endTagType the {@linkplain #getEndTagType() type} of the end tag.
|
71 |
* @param name the {@linkplain Tag#getName() name} of the tag.
|
72 |
*/
|
73 |
EndTag(final Source source, final int begin, final int end, final EndTagType endTagType, final String name) {
|
74 |
super(source,begin,end,name);
|
75 |
this.endTagType=endTagType;
|
76 |
}
|
77 |
|
78 |
/**
|
79 |
* Returns the {@linkplain Element element} that is ended by this end tag.
|
80 |
* <p>
|
81 |
* Returns <code>null</code> if this end tag is not properly matched to any {@linkplain StartTag start tag} in the source document.
|
82 |
* <p>
|
83 |
* This method is much less efficient than the {@link StartTag#getElement()} method.
|
84 |
* <p>
|
85 |
* IMPLEMENTATION NOTE: The explanation for why this method is relatively inefficient lies in the fact that more than one
|
86 |
* {@linkplain StartTagType start tag type} can have the same
|
87 |
* {@linkplain StartTagType#getCorrespondingEndTagType() corresponding end tag type}, so it is not possible to know for certain
|
88 |
* which type of start tag this end tag is matched to (see {@link EndTagType#getCorrespondingStartTagType()} for more explanation).
|
89 |
* Because of this uncertainty, the implementation of this method must check every start tag preceding this end tag, calling its
|
90 |
* {@link StartTag#getElement()} method to see whether it is terminated by this end tag.
|
91 |
*
|
92 |
* @return the {@linkplain Element element} that is ended by this end tag.
|
93 |
*/
|
94 |
public Element getElement() {
|
95 |
if (element!=Element.NOT_CACHED) return element;
|
96 |
int pos=begin;
|
97 |
while (pos!=0) {
|
98 |
StartTag startTag=source.getPreviousStartTag(pos-1);
|
99 |
if (startTag==null) break;
|
100 |
Element foundElement=startTag.getElement(); // this automatically sets foundElement.getEndTag().element cache
|
101 |
if (foundElement.getEndTag()==this) return foundElement; // no need to set element as it was already done in previous statement
|
102 |
pos=startTag.begin;
|
103 |
}
|
104 |
return element=null;
|
105 |
}
|
106 |
|
107 |
/**
|
108 |
* Returns the {@linkplain EndTagType type} of this end tag.
|
109 |
* <p>
|
110 |
* This is equivalent to <code>(EndTagType)</code>{@link #getTagType()}.
|
111 |
*
|
112 |
* @return the {@linkplain EndTagType type} of this end tag.
|
113 |
*/
|
114 |
public EndTagType getEndTagType() {
|
115 |
return endTagType;
|
116 |
}
|
117 |
|
118 |
// Documentation inherited from Tag
|
119 |
public TagType getTagType() {
|
120 |
return endTagType;
|
121 |
}
|
122 |
|
123 |
// Documentation inherited from Tag
|
124 |
public boolean isUnregistered() {
|
125 |
return endTagType==EndTagType.UNREGISTERED;
|
126 |
}
|
127 |
|
128 |
/**
|
129 |
* Returns an XML representation of this end tag.
|
130 |
* <p>
|
131 |
* This method is included for symmetry with the {@link StartTag#tidy()} method and simply
|
132 |
* returns the {@linkplain Segment#toString() source text} of the tag.
|
133 |
*
|
134 |
* @return an XML representation of this end tag.
|
135 |
*/
|
136 |
public String tidy() {
|
137 |
return toString();
|
138 |
}
|
139 |
|
140 |
/**
|
141 |
* Generates the HTML text of a {@linkplain EndTagType#NORMAL normal} end tag with the specified tag {@linkplain #getName() name}.
|
142 |
* <p>
|
143 |
* <dl>
|
144 |
* <dt>Example:</dt>
|
145 |
* <dd>
|
146 |
* <p>
|
147 |
* The following method call:
|
148 |
* <blockquote class="code">
|
149 |
* <code>EndTag.generateHTML("INPUT")</code>
|
150 |
* </blockquote>
|
151 |
* returns the following output:
|
152 |
* <blockquote class="code">
|
153 |
* <code></INPUT></code>
|
154 |
* </blockquote>
|
155 |
* </dd>
|
156 |
* </dl>
|
157 |
*
|
158 |
* @param tagName the {@linkplain #getName() name} of the end tag.
|
159 |
* @return the HTML text of a {@linkplain EndTagType#NORMAL normal} end tag with the specified tag {@linkplain #getName() name}.
|
160 |
* @see StartTag#generateHTML(String tagName, Map attributesMap, boolean emptyElementTag)
|
161 |
*/
|
162 |
public static String generateHTML(final String tagName) {
|
163 |
return EndTagType.NORMAL.generateHTML(tagName);
|
164 |
}
|
165 |
|
166 |
public String getDebugInfo() {
|
167 |
final StringBuilder sb=new StringBuilder();
|
168 |
sb.append(this).append(' ');
|
169 |
if (endTagType!=EndTagType.NORMAL) sb.append('(').append(endTagType.getDescription()).append(") ");
|
170 |
sb.append(super.getDebugInfo());
|
171 |
return sb.toString();
|
172 |
}
|
173 |
|
174 |
/**
|
175 |
* Returns the previous end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position.
|
176 |
* <p>
|
177 |
* Called from {@link Source#getPreviousEndTag(int pos, String name)}.
|
178 |
*
|
179 |
* @param source the {@link Source} document.
|
180 |
* @param pos the position to search from.
|
181 |
* @param name the {@linkplain #getName() name} of the tag including its {@linkplain TagType#getNamePrefix() prefix} (must be lower case, may be null).
|
182 |
* @param endTagType the {@linkplain EndTagType type} of end tag to search for.
|
183 |
* @return the previous end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position, or null if none is found.
|
184 |
*/
|
185 |
static EndTag getPrevious(final Source source, final int pos, final String name, final EndTagType endTagType) {
|
186 |
if (name==null) return (EndTag)Tag.getPreviousTag(source,pos,endTagType);
|
187 |
if (name.length()==0) throw new IllegalArgumentException("name argument must not be zero length");
|
188 |
final String searchString=endTagType.START_DELIMITER_PREFIX+name;
|
189 |
try {
|
190 |
final ParseText parseText=source.getParseText();
|
191 |
int begin=pos;
|
192 |
do {
|
193 |
begin=parseText.lastIndexOf(searchString,begin);
|
194 |
if (begin==-1) return null;
|
195 |
final EndTag endTag=(EndTag)source.getTagAt(begin);
|
196 |
if (endTag!=null && endTag.getEndTagType()==endTagType && name.equals(endTag.getName())) return endTag;
|
197 |
} while ((begin-=1)>=0);
|
198 |
} catch (IndexOutOfBoundsException ex) {
|
199 |
// this should never happen during a get previous operation so rethrow it:
|
200 |
throw ex;
|
201 |
}
|
202 |
return null;
|
203 |
}
|
204 |
|
205 |
/**
|
206 |
* Returns the next end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position.
|
207 |
* <p>
|
208 |
* Called from {@link Source#getNextEndTag(int pos, String name, EndTagType endTagType)}.
|
209 |
*
|
210 |
* @param source the {@link Source} document.
|
211 |
* @param pos the position to search from.
|
212 |
* @param name the {@linkplain #getName() name} of the tag including its {@linkplain TagType#getNamePrefix() prefix} (must be lower case, may be null).
|
213 |
* @param endTagType the {@linkplain EndTagType type} of end tag to search for.
|
214 |
* @return the next end tag matching the specified {@linkplain #getName() name} and {@linkplain EndTagType type}, starting at the specified position, or null if none is found.
|
215 |
*/
|
216 |
static EndTag getNext(final Source source, final int pos, final String name, final EndTagType endTagType) {
|
217 |
if (name==null) return (EndTag)Tag.getNextTag(source,pos,endTagType);
|
218 |
if (name.length()==0) throw new IllegalArgumentException("name argument must not be zero length");
|
219 |
final String searchString=endTagType.START_DELIMITER_PREFIX+name;
|
220 |
try {
|
221 |
final ParseText parseText=source.getParseText();
|
222 |
int begin=pos;
|
223 |
do {
|
224 |
begin=parseText.indexOf(searchString,begin);
|
225 |
if (begin==-1) return null;
|
226 |
final EndTag endTag=(EndTag)source.getTagAt(begin);
|
227 |
if (endTag!=null && endTag.getEndTagType()==endTagType && name.equals(endTag.getName())) return endTag;
|
228 |
} while ((begin+=1)<source.end);
|
229 |
} catch (IndexOutOfBoundsException ex) {
|
230 |
// this should only happen when the end of file is reached in the middle of a tag.
|
231 |
// we don't have to do anything to handle it as there will be no more tags anyway.
|
232 |
}
|
233 |
return null;
|
234 |
}
|
235 |
|
236 |
static EndTag getPrevious(final Source source, int pos) {
|
237 |
while (true) {
|
238 |
final Tag tag=Tag.getPreviousTag(source,pos);
|
239 |
if (tag==null) return null;
|
240 |
if (tag instanceof EndTag) return (EndTag)tag;
|
241 |
pos-=1;
|
242 |
}
|
243 |
}
|
244 |
|
245 |
static EndTag getNext(final Source source, int pos) {
|
246 |
while (true) {
|
247 |
final Tag tag=Tag.getNextTag(source,pos);
|
248 |
if (tag==null) return null;
|
249 |
if (tag instanceof EndTag) return (EndTag)tag;
|
250 |
pos+=1;
|
251 |
}
|
252 |
}
|
253 |
}
|
254 |
|