/[aagtl_public1]/src/net/htmlparser/jericho/Segment.java
aagtl

Diff of /src/net/htmlparser/jericho/Segment.java

Parent Directory | Revision Log | View Patch

Revision 3 Revision 4
18// WITHOUT WARRANTY OF ANY KIND, either express or implied. 18// WITHOUT WARRANTY OF ANY KIND, either express or implied.
19// See the individual licence texts for more details. 19// See the individual licence texts for more details.
20 20
21package net.htmlparser.jericho; 21package net.htmlparser.jericho;
22 22
23import java.io.Writer;
24import java.util.ArrayList;
25import java.util.Collection;
26import java.util.Collections;
23import java.util.Iterator; 27import java.util.Iterator;
24import java.util.List; 28import java.util.List;
25import java.util.Collections;
26import java.util.ArrayList;
27import java.util.regex.Pattern; 29import java.util.regex.Pattern;
28 30
29/** 31/**
30 * Represents a segment of a {@link Source} document. 32 * Represents a segment of a {@link Source} document.
31 * <p> 33 * <p>
32 * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class. 34 * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
33 * <p> 35 * <p>
34 * The <i>span</i> of a segment is defined by the combination of its begin and end character positions. 36 * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
35 */ 37 */
36public class Segment implements Comparable<Segment>, CharSequence { 38public class Segment implements Comparable<Segment>, CharSequence
39{
37 final int begin; 40 final int begin;
38 final int end; 41 final int end;
39 final Source source; 42 final Source source;
40 43
41 private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method 44 private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t', '\f', '\u200B' }; // see comments in isWhiteSpace(char) method
42 45
43 /** 46 /**
44 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions. 47 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
48 *
49 * @param source
45 * @param source the {@link Source} document, must not be <code>null</code>. 50 * the {@link Source} document, must not be <code>null</code>.
51 * @param begin
46 * @param begin the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive. 52 * the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
53 * @param end
47 * @param end the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive. 54 * the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
48 */ 55 */
49 public Segment(final Source source, final int begin, final int end) { 56 public Segment(final Source source, final int begin, final int end)
57 {
50 if (begin==-1 || end==-1 || begin>end) throw new IllegalArgumentException(); 58 if (begin == -1 || end == -1 || begin > end) throw new IllegalArgumentException();
51 this.begin=begin; 59 this.begin = begin;
52 this.end=end; 60 this.end = end;
53 if (source==null) throw new IllegalArgumentException("source argument must not be null"); 61 if (source == null) throw new IllegalArgumentException("source argument must not be null");
54 this.source=source; 62 this.source = source;
55 } 63 }
56 64
57 // Only called from Source constructor 65 // Only called from Source constructor
58 Segment(final int length) { 66 Segment(final int length)
67 {
59 begin=0; 68 begin = 0;
60 this.end=length; 69 this.end = length;
61 source=(Source)this; 70 source = (Source) this;
62 } 71 }
63 72
64 // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED) 73 // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
65 Segment() { 74 Segment()
75 {
66 this(0,0); 76 this(0, 0);
67 } 77 }
68 78
69 // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT) 79 // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
70 Segment(final int begin, final int end) { 80 Segment(final int begin, final int end)
81 {
71 this.begin=begin; 82 this.begin = begin;
72 this.end=end; 83 this.end = end;
73 source=null; 84 source = null;
74 } 85 }
75 86
76 /** 87 /**
77 * Returns the {@link Source} document containing this segment. 88 * Returns the {@link Source} document containing this segment.
78 * <p> 89 * <p>
79 * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>. 90 * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
80 * 91 *
81 * @return the {@link Source} document containing this segment. 92 * @return the {@link Source} document containing this segment.
82 */ 93 */
83 public final Source getSource() { 94 public final Source getSource()
95 {
84 if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource"); 96 if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
85 return source; 97 return source;
86 } 98 }
87 99
88 /** 100 /**
89 * Returns the character position in the {@link Source} document at which this segment begins, inclusive. 101 * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
90 * <p> 102 * <p>
91 * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position. 103 * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
92 * 104 *
93 * @return the character position in the {@link Source} document at which this segment begins, inclusive. 105 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
94 */ 106 */
95 public final int getBegin() { 107 public final int getBegin()
108 {
96 return begin; 109 return begin;
97 } 110 }
98 111
99 /** 112 /**
100 * Returns the character position in the {@link Source} document immediately after the end of this segment. 113 * Returns the character position in the {@link Source} document immediately after the end of this segment.
101 * <p> 114 * <p>
102 * The character at the position specified by this property is <b>not</b> included in the segment. 115 * The character at the position specified by this property is <b>not</b> included in the segment.
103 * 116 *
104 * @return the character position in the {@link Source} document immediately after the end of this segment. 117 * @return the character position in the {@link Source} document immediately after the end of this segment.
105 * @see #getBegin() 118 * @see #getBegin()
106 */ 119 */
107 public final int getEnd() { 120 public final int getEnd()
121 {
108 return end; 122 return end;
109 } 123 }
110 124
111 /** 125 /**
112 * Compares the specified object with this <code>Segment</code> for equality. 126 * Compares the specified object with this <code>Segment</code> for equality.
113 * <p> 127 * <p>
114 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>, 128 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>, and both segments have the same {@link Source}, and the same begin and end positions.
115 * and both segments have the same {@link Source}, and the same begin and end positions. 129 *
130 * @param object
116 * @param object the object to be compared for equality with this <code>Segment</code>. 131 * the object to be compared for equality with this <code>Segment</code>.
117 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>. 132 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
118 */ 133 */
119 public final boolean equals(final Object object) { 134 public final boolean equals(final Object object)
135 {
120 if (this==object) return true; 136 if (this == object) return true;
121 if (object==null || !(object instanceof Segment)) return false; 137 if (object == null || !(object instanceof Segment)) return false;
122 final Segment segment=(Segment)object; 138 final Segment segment = (Segment) object;
123 return segment.begin==begin && segment.end==end && segment.source==source; 139 return segment.begin == begin && segment.end == end && segment.source == source;
124 } 140 }
125 141
126 /** 142 /**
127 * Returns a hash code value for the segment. 143 * Returns a hash code value for the segment.
128 * <p> 144 * <p>
129 * The current implementation returns the sum of the begin and end positions, although this is not 145 * The current implementation returns the sum of the begin and end positions, although this is not guaranteed in future versions.
130 * guaranteed in future versions.
131 * 146 *
132 * @return a hash code value for the segment. 147 * @return a hash code value for the segment.
133 */ 148 */
134 public int hashCode() { 149 public int hashCode()
150 {
135 return begin+end; 151 return begin + end;
136 } 152 }
137 153
138 /** 154 /**
139 * Returns the length of the segment. 155 * Returns the length of the segment.
140 * This is defined as the number of characters between the begin and end positions. 156 * This is defined as the number of characters between the begin and end positions.
157 *
141 * @return the length of the segment. 158 * @return the length of the segment.
142 */ 159 */
143 public int length() { 160 public int length()
161 {
144 return end-begin; 162 return end - begin;
145 } 163 }
146 164
147 /** 165 /**
148 * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>. 166 * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
149 * <p> 167 * <p>
150 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}. 168 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
151 * <p> 169 * <p>
152 * Note that a segment encloses itself. 170 * Note that a segment encloses itself.
153 * 171 *
172 * @param segment
154 * @param segment the segment to be tested for being enclosed by this segment. 173 * the segment to be tested for being enclosed by this segment.
155 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>. 174 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
156 */ 175 */
157 public final boolean encloses(final Segment segment) { 176 public final boolean encloses(final Segment segment)
177 {
158 return begin<=segment.begin && end>=segment.end; 178 return begin <= segment.begin && end >= segment.end;
159 } 179 }
160 180
161 /** 181 /**
162 * Indicates whether this segment encloses the specified character position in the source document. 182 * Indicates whether this segment encloses the specified character position in the source document.
163 * <p> 183 * <p>
164 * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}. 184 * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
165 * 185 *
186 * @param pos
166 * @param pos the position in the {@link Source} document. 187 * the position in the {@link Source} document.
167 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>. 188 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
168 */ 189 */
169 public final boolean encloses(final int pos) { 190 public final boolean encloses(final int pos)
191 {
170 return begin<=pos && pos<end; 192 return begin <= pos && pos < end;
171 } 193 }
172 194
173 /** 195 /**
174 * Returns the source text of this segment as a <code>String</code>. 196 * Returns the source text of this segment as a <code>String</code>.
175 * <p> 197 * <p>
176 * The returned <code>String</code> is newly created with every call to this method, unless this 198 * The returned <code>String</code> is newly created with every call to this method, unless this segment is itself an instance of {@link Source}.
177 * segment is itself an instance of {@link Source}.
178 * 199 *
179 * @return the source text of this segment as a <code>String</code>. 200 * @return the source text of this segment as a <code>String</code>.
180 */ 201 */
181 public String toString() { 202 public String toString()
203 {
182 return source.subSequence(begin,end).toString(); 204 return source.subSequence(begin, end).toString();
183 } 205 }
184 206
185 /** 207 /**
186 * Performs a simple rendering of the HTML markup in this segment into text. 208 * Performs a simple rendering of the HTML markup in this segment into text.
187 * <p> 209 * <p>
188 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before 210 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before {@linkplain Renderer#writeTo(Writer) obtaining its output}.
189 * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
190 * 211 *
191 * @return an instance of {@link Renderer} based on this segment. 212 * @return an instance of {@link Renderer} based on this segment.
192 * @see #getTextExtractor() 213 * @see #getTextExtractor()
193 */ 214 */
194 public Renderer getRenderer() { 215 public Renderer getRenderer()
216 {
195 return new Renderer(this); 217 return new Renderer(this);
196 } 218 }
197 219
198 /** 220 /**
199 * Extracts the textual content from the HTML markup of this segment. 221 * Extracts the textual content from the HTML markup of this segment.
200 * <p> 222 * <p>
201 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before 223 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
202 * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}. 224 * <p>
203 * <p> 225 *
204 * @return an instance of {@link TextExtractor} based on this segment. 226 * @return an instance of {@link TextExtractor} based on this segment.
205 * @see #getRenderer() 227 * @see #getRenderer()
206 */ 228 */
207 public TextExtractor getTextExtractor() { 229 public TextExtractor getTextExtractor()
230 {
208 return new TextExtractor(this); 231 return new TextExtractor(this);
209 } 232 }
210 233
211 /** 234 /**
212 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment. 235 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
213 * <p> 236 * <p>
214 * See the {@link Source#iterator()} method for a detailed description. 237 * See the {@link Source#iterator()} method for a detailed description.
215 * <p> 238 * <p>
216 * <dl> 239 * <dl>
217 * <dt>Example:</dt> 240 * <dt>Example:</dt>
218 * <dd> 241 * <dd>
219 * <p> 242 * <p>
220 * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present): 243 * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
221 * </p> 244 * </p>
245 *
222 * <pre> 246 * <pre>
223 * for (Iterator&lt;Segment&gt; nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) { 247 * for (Iterator&lt;Segment&gt; nodeIterator = segment.getNodeIterator(); nodeIterator.hasNext();)
248 * {
224 * Segment nodeSegment=nodeIterator.next(); 249 * Segment nodeSegment = nodeIterator.next();
225 * if (nodeSegment instanceof Tag) { 250 * if (nodeSegment instanceof Tag)
251 * {
226 * Tag tag=(Tag)nodeSegment; 252 * Tag tag = (Tag) nodeSegment;
227 * // HANDLE TAG 253 * // HANDLE TAG
228 * // Uncomment the following line to ensure each tag is valid XML: 254 * // Uncomment the following line to ensure each tag is valid XML:
229 * // writer.write(tag.tidy()); continue; 255 * // writer.write(tag.tidy()); continue;
256 * }
230 * } else if (nodeSegment instanceof CharacterReference) { 257 * else if (nodeSegment instanceof CharacterReference)
258 * {
231 * CharacterReference characterReference=(CharacterReference)nodeSegment; 259 * CharacterReference characterReference = (CharacterReference) nodeSegment;
232 * // HANDLE CHARACTER REFERENCE 260 * // HANDLE CHARACTER REFERENCE
233 * // Uncomment the following line to decode all character references instead of copying them verbatim: 261 * // Uncomment the following line to decode all character references instead of copying them verbatim:
234 * // characterReference.appendCharTo(writer); continue; 262 * // characterReference.appendCharTo(writer); continue;
235 * } else { 263 * }
264 * else
265 * {
236 * // HANDLE PLAIN TEXT 266 * // HANDLE PLAIN TEXT
237 * } 267 * }
238 * // unless specific handling has prevented getting to here, simply output the segment as is: 268 * // unless specific handling has prevented getting to here, simply output the segment as is:
239 * writer.write(nodeSegment.toString()); 269 * writer.write(nodeSegment.toString());
270 * }
240 * }</pre> 271 * </pre>
272 *
241 * </dd> 273 * </dd>
242 * </dl> 274 * </dl>
275 *
243 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment. 276 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
244 */ 277 */
245 public Iterator<Segment> getNodeIterator() { 278 public Iterator<Segment> getNodeIterator()
279 {
246 return new NodeIterator(this); 280 return new NodeIterator(this);
247 } 281 }
248 282
249 /** 283 /**
250 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 284 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
251 * <p> 285 * <p>
252 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object 286 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
253 * if this method is to be used on a large proportion of the source.
254 * It is called automatically if this method is called on the {@link Source} object itself.
255 * <p> 287 * <p>
256 * See the {@link Tag} class documentation for more details about the behaviour of this method. 288 * See the {@link Tag} class documentation for more details about the behaviour of this method.
257 * 289 *
258 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 290 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
259 */ 291 */
260 public List<Tag> getAllTags() { 292 public List<Tag> getAllTags()
293 {
261 return getAllTags(null); 294 return getAllTags(null);
262 } 295 }
263 296
264 /** 297 /**
265 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. 298 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
266 * <p> 299 * <p>
267 * See the {@link Tag} class documentation for more details about the behaviour of this method. 300 * See the {@link Tag} class documentation for more details about the behaviour of this method.
268 * <p> 301 * <p>
269 * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}. 302 * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
270 * 303 *
304 * @param tagType
271 * @param tagType the {@linkplain TagType type} of tags to get. 305 * the {@linkplain TagType type} of tags to get.
272 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. 306 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
273 * @see #getAllStartTags(StartTagType) 307 * @see #getAllStartTags(StartTagType)
274 */ 308 */
275 public List<Tag> getAllTags(final TagType tagType) { 309 public List<Tag> getAllTags(final TagType tagType)
310 {
276 Tag tag=checkTagEnclosure(Tag.getNextTag(source,begin,tagType)); 311 Tag tag = checkTagEnclosure(Tag.getNextTag(source, begin, tagType));
277 if (tag==null) return Collections.emptyList(); 312 if (tag == null) return Collections.emptyList();
278 final ArrayList<Tag> list=new ArrayList<Tag>(); 313 final ArrayList<Tag> list = new ArrayList<Tag>();
279 do { 314 do
315 {
280 list.add(tag); 316 list.add(tag);
281 tag=checkTagEnclosure(tag.getNextTag(tagType)); 317 tag = checkTagEnclosure(tag.getNextTag(tagType));
318 }
282 } while (tag!=null); 319 while (tag != null);
283 return list; 320 return list;
284 } 321 }
285 322
286 /** 323 /**
287 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 324 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
288 * <p> 325 * <p>
289 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object 326 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
290 * if this method is to be used on a large proportion of the source.
291 * It is called automatically if this method is called on the {@link Source} object itself.
292 * <p> 327 * <p>
293 * See the {@link Tag} class documentation for more details about the behaviour of this method. 328 * See the {@link Tag} class documentation for more details about the behaviour of this method.
294 * 329 *
295 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 330 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
296 */ 331 */
297 public List<StartTag> getAllStartTags() { 332 public List<StartTag> getAllStartTags()
333 {
298 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin)); 334 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
299 if (startTag==null) return Collections.emptyList(); 335 if (startTag == null) return Collections.emptyList();
300 final ArrayList<StartTag> list=new ArrayList<StartTag>(); 336 final ArrayList<StartTag> list = new ArrayList<StartTag>();
301 do { 337 do
338 {
302 list.add(startTag); 339 list.add(startTag);
303 startTag=checkEnclosure(startTag.getNextStartTag()); 340 startTag = checkEnclosure(startTag.getNextStartTag());
341 }
304 } while (startTag!=null); 342 while (startTag != null);
305 return list; 343 return list;
306 } 344 }
307 345
308 /** 346 /**
309 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. 347 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
310 * <p> 348 * <p>
311 * See the {@link Tag} class documentation for more details about the behaviour of this method. 349 * See the {@link Tag} class documentation for more details about the behaviour of this method.
312 * <p> 350 * <p>
313 * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}. 351 * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
314 * 352 *
353 * @param startTagType
315 * @param startTagType the {@linkplain StartTagType type} of tags to get. 354 * the {@linkplain StartTagType type} of tags to get.
316 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. 355 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
317 */ 356 */
318 public List<StartTag> getAllStartTags(final StartTagType startTagType) { 357 public List<StartTag> getAllStartTags(final StartTagType startTagType)
358 {
319 if (startTagType==null) return getAllStartTags(); 359 if (startTagType == null) return getAllStartTags();
320 StartTag startTag=(StartTag)checkTagEnclosure(Tag.getNextTag(source,begin,startTagType)); 360 StartTag startTag = (StartTag) checkTagEnclosure(Tag.getNextTag(source, begin, startTagType));
321 if (startTag==null) return Collections.emptyList(); 361 if (startTag == null) return Collections.emptyList();
322 final ArrayList<StartTag> list=new ArrayList<StartTag>(); 362 final ArrayList<StartTag> list = new ArrayList<StartTag>();
323 do { 363 do
364 {
324 list.add(startTag); 365 list.add(startTag);
325 startTag=(StartTag)checkTagEnclosure(startTag.getNextTag(startTagType)); 366 startTag = (StartTag) checkTagEnclosure(startTag.getNextTag(startTagType));
367 }
326 } while (startTag!=null); 368 while (startTag != null);
327 return list; 369 return list;
328 } 370 }
329 371
330 /** 372 /**
331 * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment. 373 * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
332 * <p> 374 * <p>
333 * See the {@link Tag} class documentation for more details about the behaviour of this method. 375 * See the {@link Tag} class documentation for more details about the behaviour of this method.
334 * <p> 376 * <p>
335 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags. 377 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
336 * <p> 378 * <p>
337 * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}. 379 * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
338 * 380 *
381 * @param name
339 * @param name the {@linkplain StartTag#getName() name} of the start tags to get. 382 * the {@linkplain StartTag#getName() name} of the start tags to get.
340 * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment. 383 * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
341 */ 384 */
342 public List<StartTag> getAllStartTags(String name) { 385 public List<StartTag> getAllStartTags(String name)
386 {
343 if (name==null) return getAllStartTags(); 387 if (name == null) return getAllStartTags();
344 final boolean isXMLTagName=Tag.isXMLName(name); 388 final boolean isXMLTagName = Tag.isXMLName(name);
345 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName)); 389 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
346 if (startTag==null) return Collections.emptyList(); 390 if (startTag == null) return Collections.emptyList();
347 final ArrayList<StartTag> list=new ArrayList<StartTag>(); 391 final ArrayList<StartTag> list = new ArrayList<StartTag>();
348 do { 392 do
393 {
349 list.add(startTag); 394 list.add(startTag);
350 startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName)); 395 startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
396 }
351 } while (startTag!=null); 397 while (startTag != null);
352 return list; 398 return list;
353 } 399 }
354 400
355 /** 401 /**
356 * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. 402 * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
357 * <p> 403 * <p>
358 * See the {@link Tag} class documentation for more details about the behaviour of this method. 404 * See the {@link Tag} class documentation for more details about the behaviour of this method.
359 * 405 *
406 * @param attributeName
360 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 407 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
408 * @param value
361 * @param value the value of the specified attribute to search for, must not be <code>null</code>. 409 * the value of the specified attribute to search for, must not be <code>null</code>.
410 * @param valueCaseSensitive
362 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive. 411 * specifies whether the attribute value matching is case sensitive.
363 * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. 412 * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
364 * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern) 413 * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
365 */ 414 */
366 public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) { 415 public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive)
416 {
367 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive)); 417 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
368 if (startTag==null) return Collections.emptyList(); 418 if (startTag == null) return Collections.emptyList();
369 final ArrayList<StartTag> list=new ArrayList<StartTag>(); 419 final ArrayList<StartTag> list = new ArrayList<StartTag>();
370 do { 420 do
421 {
371 list.add(startTag); 422 list.add(startTag);
372 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive)); 423 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
424 }
373 } while (startTag!=null); 425 while (startTag != null);
374 return list; 426 return list;
375 } 427 }
376 428
377 /** 429 /**
378 * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. 430 * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
379 * <p> 431 * <p>
380 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, 432 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
381 * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
382 * <p> 433 * <p>
383 * See the {@link Tag} class documentation for more details about the behaviour of this method. 434 * See the {@link Tag} class documentation for more details about the behaviour of this method.
384 * 435 *
436 * @param attributeName
385 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 437 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
438 * @param valueRegexPattern
386 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>. 439 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
387 * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. 440 * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
388 * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive) 441 * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
389 */ 442 */
390 public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern) { 443 public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern)
444 {
391 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern)); 445 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
392 if (startTag==null) return Collections.emptyList(); 446 if (startTag == null) return Collections.emptyList();
393 final ArrayList<StartTag> list=new ArrayList<StartTag>(); 447 final ArrayList<StartTag> list = new ArrayList<StartTag>();
394 do { 448 do
449 {
395 list.add(startTag); 450 list.add(startTag);
396 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern)); 451 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
452 }
397 } while (startTag!=null); 453 while (startTag != null);
398 return list; 454 return list;
399 } 455 }
400 456
401 /** 457 /**
402 * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. 458 * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
403 * <p> 459 * <p>
404 * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple 460 * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
405 * class names separated by white space in the attribute value.
406 * <p> 461 * <p>
407 * See the {@link Tag} class documentation for more details about the behaviour of this method. 462 * See the {@link Tag} class documentation for more details about the behaviour of this method.
408 * 463 *
464 * @param className
409 * @param className the class name (case sensitive) to search for, must not be <code>null</code>. 465 * the class name (case sensitive) to search for, must not be <code>null</code>.
410 * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. 466 * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
411 */ 467 */
412 public List<StartTag> getAllStartTagsByClass(final String className) { 468 public List<StartTag> getAllStartTagsByClass(final String className)
469 {
413 return getAllStartTags("class",getClassPattern(className)); 470 return getAllStartTags("class", getClassPattern(className));
414 } 471 }
415 472
416 /** 473 /**
417 * Returns a list of the immediate children of this segment in the document element hierarchy. 474 * Returns a list of the immediate children of this segment in the document element hierarchy.
418 * <p> 475 * <p>
419 * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment. 476 * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
420 * <p> 477 * <p>
421 * An element found at the start of this segment is included in the list. 478 * An element found at the start of this segment is included in the list. Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead, which only returns the children of the element.
422 * Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
423 * which only returns the children of the element.
424 * <p> 479 * <p>
425 * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>. 480 * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
426 * <p> 481 * <p>
427 * The objects in the list are all of type {@link Element}. 482 * The objects in the list are all of type {@link Element}.
428 * <p> 483 * <p>
429 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object 484 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
430 * if this method is to be used on a large proportion of the source.
431 * It is called automatically if this method is called on the {@link Source} object itself.
432 * <p> 485 * <p>
433 * See the {@link Source#getChildElements()} method for more details. 486 * See the {@link Source#getChildElements()} method for more details.
434 * 487 *
435 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>. 488 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
436 * @see Element#getParentElement() 489 * @see Element#getParentElement()
437 */ 490 */
438 public List<Element> getChildElements() { 491 public List<Element> getChildElements()
492 {
439 if (length()==0) return Collections.emptyList(); 493 if (length() == 0) return Collections.emptyList();
440 List<Element> childElements=new ArrayList<Element>(); 494 List<Element> childElements = new ArrayList<Element>();
441 int pos=begin; 495 int pos = begin;
442 while (true) { 496 while (true)
497 {
443 final StartTag childStartTag=source.getNextStartTag(pos); 498 final StartTag childStartTag = source.getNextStartTag(pos);
444 if (childStartTag==null || childStartTag.begin>=end) break; 499 if (childStartTag == null || childStartTag.begin >= end) break;
445 if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) { 500 if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag())
501 {
446 pos=childStartTag.end; 502 pos = childStartTag.end;
447 continue; 503 continue;
448 } 504 }
449 final Element childElement=childStartTag.getElement(); 505 final Element childElement = childStartTag.getElement();
450 childElements.add(childElement); 506 childElements.add(childElement);
451 childElement.getChildElements(); 507 childElement.getChildElements();
452 pos=childElement.end; 508 pos = childElement.end;
453 } 509 }
454 return childElements; 510 return childElements;
455 } 511 }
456 512
457 /** 513 /**
458 * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 514 * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
459 * <p> 515 * <p>
460 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object 516 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
461 * if this method is to be used on a large proportion of the source.
462 * It is called automatically if this method is called on the {@link Source} object itself.
463 * <p> 517 * <p>
464 * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method. 518 * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
465 * <p> 519 * <p>
466 * If this segment is itself an {@link Element}, the result includes this element in the list. 520 * If this segment is itself an {@link Element}, the result includes this element in the list.
467 * 521 *
468 * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 522 * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
469 */ 523 */
470 public List<Element> getAllElements() { 524 public List<Element> getAllElements()
525 {
471 return getAllElements(getAllStartTags()); 526 return getAllElements(getAllStartTags());
472 } 527 }
473 528
474 /** 529 /**
475 * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment. 530 * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
476 * <p> 531 * <p>
477 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method, 532 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method, except that elements which are not entirely enclosed by this segment are excluded.
478 * except that elements which are not entirely enclosed by this segment are excluded.
479 * <p> 533 * <p>
480 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags. 534 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
481 * <p> 535 * <p>
482 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}. 536 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
483 * <p> 537 * <p>
484 * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list. 538 * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
485 * 539 *
540 * @param name
486 * @param name the {@linkplain Element#getName() name} of the elements to get. 541 * the {@linkplain Element#getName() name} of the elements to get.
487 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment. 542 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
488 */ 543 */
489 public List<Element> getAllElements(String name) { 544 public List<Element> getAllElements(String name)
545 {
490 return getAllElements(getAllStartTags(name)); 546 return getAllElements(getAllStartTags(name));
491 } 547 }
492 548
493 /** 549 /**
494 * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. 550 * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
495 * <p> 551 * <p>
496 * The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method, 552 * The elements returned correspond with the start tags returned in the {@link #getAllTags(TagType)} method, except that elements which are not entirely enclosed by this segment are excluded.
497 * except that elements which are not entirely enclosed by this segment are excluded.
498 * <p> 553 * <p>
499 * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list. 554 * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
500 * 555 *
556 * @param startTagType
501 * @param startTagType the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>. 557 * the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
502 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment. 558 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
503 */ 559 */
504 public List<Element> getAllElements(final StartTagType startTagType) { 560 public List<Element> getAllElements(final StartTagType startTagType)
561 {
505 if (startTagType==null) throw new IllegalArgumentException("startTagType argument must not be null"); 562 if (startTagType == null) throw new IllegalArgumentException("startTagType argument must not be null");
506 return getAllElements(getAllStartTags(startTagType)); 563 return getAllElements(getAllStartTags(startTagType));
507 } 564 }
508 565
509 /** 566 /**
510 * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. 567 * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
511 * <p> 568 * <p>
512 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method, 569 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method, except that elements which are not entirely enclosed by this segment are excluded.
513 * except that elements which are not entirely enclosed by this segment are excluded.
514 * <p> 570 * <p>
515 * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list. 571 * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
516 * 572 *
573 * @param attributeName
517 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 574 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
575 * @param value
518 * @param value the value of the specified attribute to search for, must not be <code>null</code>. 576 * the value of the specified attribute to search for, must not be <code>null</code>.
577 * @param valueCaseSensitive
519 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive. 578 * specifies whether the attribute value matching is case sensitive.
520 * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment. 579 * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
521 * @see #getAllElements(String attributeName, Pattern valueRegexPattern) 580 * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
522 */ 581 */
523 public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) { 582 public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive)
583 {
524 return getAllElements(getAllStartTags(attributeName,value,valueCaseSensitive)); 584 return getAllElements(getAllStartTags(attributeName, value, valueCaseSensitive));
525 } 585 }
526 586
527 /** 587 /**
528 * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. 588 * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
529 * <p> 589 * <p>
530 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method, 590 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method, except that elements which are not entirely enclosed by this segment are excluded.
531 * except that elements which are not entirely enclosed by this segment are excluded.
532 * <p> 591 * <p>
533 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, 592 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
534 * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
535 * <p> 593 * <p>
536 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list. 594 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
537 * 595 *
596 * @param attributeName
538 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 597 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
598 * @param valueRegexPattern
539 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>. 599 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
540 * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment. 600 * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
541 * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive) 601 * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
542 */ 602 */
543 public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern) { 603 public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern)
604 {
544 return getAllElements(getAllStartTags(attributeName,valueRegexPattern)); 605 return getAllElements(getAllStartTags(attributeName, valueRegexPattern));
545 } 606 }
546 607
547 /** 608 /**
548 * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. 609 * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
549 * <p> 610 * <p>
550 * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple 611 * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
551 * class names separated by white space in the attribute value.
552 * <p> 612 * <p>
553 * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method, 613 * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method, except that elements which are not entirely enclosed by this segment are excluded.
554 * except that elements which are not entirely enclosed by this segment are excluded.
555 * <p> 614 * <p>
556 * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list. 615 * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
557 * 616 *
617 * @param className
558 * @param className the class name (case sensitive) to search for, must not be <code>null</code>. 618 * the class name (case sensitive) to search for, must not be <code>null</code>.
559 * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment. 619 * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
560 */ 620 */
561 public List<Element> getAllElementsByClass(final String className) { 621 public List<Element> getAllElementsByClass(final String className)
622 {
562 return getAllElements(getAllStartTagsByClass(className)); 623 return getAllElements(getAllStartTagsByClass(className));
563 } 624 }
564 625
565 /** 626 /**
566 * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 627 * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
628 *
567 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 629 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
568 */ 630 */
569 public List<CharacterReference> getAllCharacterReferences() { 631 public List<CharacterReference> getAllCharacterReferences()
632 {
570 CharacterReference characterReference=getNextCharacterReference(begin); 633 CharacterReference characterReference = getNextCharacterReference(begin);
571 if (characterReference==null) return Collections.emptyList(); 634 if (characterReference == null) return Collections.emptyList();
572 final ArrayList<CharacterReference> list=new ArrayList<CharacterReference>(); 635 final ArrayList<CharacterReference> list = new ArrayList<CharacterReference>();
573 do { 636 do
637 {
574 list.add(characterReference); 638 list.add(characterReference);
575 characterReference=getNextCharacterReference(characterReference.end); 639 characterReference = getNextCharacterReference(characterReference.end);
640 }
576 } while (characterReference!=null); 641 while (characterReference != null);
577 return list; 642 return list;
578 } 643 }
579 644
580 /** 645 /**
581 * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values. 646 * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
582 * <p> 647 * <p>
583 * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values: 648 * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
584 * <table class="bordered" cellspacing="0"> 649 * <table class="bordered" cellspacing="0">
585 * <tr><th>HTML element name<th>Attribute name 650 * <tr>
651 * <th>HTML element name
652 * <th>Attribute name
653 * <tr>
586 * <tr><td>{@link HTMLElementName#A A}<td>href 654 * <td>{@link HTMLElementName#A A}
655 * <td>href
656 * <tr>
587 * <tr><td>{@link HTMLElementName#APPLET APPLET}<td>codebase 657 * <td>{@link HTMLElementName#APPLET APPLET}
658 * <td>codebase
659 * <tr>
588 * <tr><td>{@link HTMLElementName#AREA AREA}<td>href 660 * <td>{@link HTMLElementName#AREA AREA}
661 * <td>href
662 * <tr>
589 * <tr><td>{@link HTMLElementName#BASE BASE}<td>href 663 * <td>{@link HTMLElementName#BASE BASE}
664 * <td>href
665 * <tr>
590 * <tr><td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}<td>cite 666 * <td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}
667 * <td>cite
668 * <tr>
591 * <tr><td>{@link HTMLElementName#BODY BODY}<td>background 669 * <td>{@link HTMLElementName#BODY BODY}
670 * <td>background
671 * <tr>
592 * <tr><td>{@link HTMLElementName#FORM FORM}<td>action 672 * <td>{@link HTMLElementName#FORM FORM}
593 * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>longdesc 673 * <td>action
674 * <tr>
594 * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>src 675 * <td>{@link HTMLElementName#FRAME FRAME}
676 * <td>longdesc
677 * <tr>
678 * <td>{@link HTMLElementName#FRAME FRAME}
679 * <td>src
680 * <tr>
595 * <tr><td>{@link HTMLElementName#DEL DEL}<td>cite 681 * <td>{@link HTMLElementName#DEL DEL}
682 * <td>cite
683 * <tr>
596 * <tr><td>{@link HTMLElementName#HEAD HEAD}<td>profile 684 * <td>{@link HTMLElementName#HEAD HEAD}
597 * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>longdesc 685 * <td>profile
686 * <tr>
598 * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>src 687 * <td>{@link HTMLElementName#IFRAME IFRAME}
599 * <tr><td>{@link HTMLElementName#IMG IMG}<td>longdesc 688 * <td>longdesc
689 * <tr>
690 * <td>{@link HTMLElementName#IFRAME IFRAME}
691 * <td>src
692 * <tr>
600 * <tr><td>{@link HTMLElementName#IMG IMG}<td>src 693 * <td>{@link HTMLElementName#IMG IMG}
694 * <td>longdesc
695 * <tr>
601 * <tr><td>{@link HTMLElementName#IMG IMG}<td>usemap 696 * <td>{@link HTMLElementName#IMG IMG}
697 * <td>src
698 * <tr>
699 * <td>{@link HTMLElementName#IMG IMG}
700 * <td>usemap
701 * <tr>
602 * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>src 702 * <td>{@link HTMLElementName#INPUT INPUT}
703 * <td>src
704 * <tr>
603 * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>usemap 705 * <td>{@link HTMLElementName#INPUT INPUT}
706 * <td>usemap
707 * <tr>
604 * <tr><td>{@link HTMLElementName#INS INS}<td>cite 708 * <td>{@link HTMLElementName#INS INS}
709 * <td>cite
710 * <tr>
605 * <tr><td>{@link HTMLElementName#LINK LINK}<td>href 711 * <td>{@link HTMLElementName#LINK LINK}
606 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>classid 712 * <td>href
607 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>codebase 713 * <tr>
608 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>data 714 * <td>{@link HTMLElementName#OBJECT OBJECT}
715 * <td>classid
716 * <tr>
609 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>usemap 717 * <td>{@link HTMLElementName#OBJECT OBJECT}
718 * <td>codebase
719 * <tr>
720 * <td>{@link HTMLElementName#OBJECT OBJECT}
721 * <td>data
722 * <tr>
723 * <td>{@link HTMLElementName#OBJECT OBJECT}
724 * <td>usemap
725 * <tr>
610 * <tr><td>{@link HTMLElementName#Q Q}<td>cite 726 * <td>{@link HTMLElementName#Q Q}
727 * <td>cite
728 * <tr>
611 * <tr><td>{@link HTMLElementName#SCRIPT SCRIPT}<td>src 729 * <td>{@link HTMLElementName#SCRIPT SCRIPT}
730 * <td>src
612 * </table> 731 * </table>
613 * <p> 732 * <p>
614 * Attributes from other elements may also be returned if the attribute name matches one of those in the list above. 733 * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
615 * <p> 734 * <p>
616 * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document. 735 * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
617 * <p> 736 * <p>
618 * The attributes are returned in order of appearance. 737 * The attributes are returned in order of appearance.
619 * 738 *
620 * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values. 739 * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
621 * @see #getStyleURISegments() 740 * @see #getStyleURISegments()
622 */ 741 */
623 public List<Attribute> getURIAttributes() { 742 public List<Attribute> getURIAttributes()
743 {
624 return URIAttributes.getList(this); 744 return URIAttributes.getList(this);
625 } 745 }
626 746
627 /** 747 /**
628 * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} 748 * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
629 * inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
630 * <p> 749 * <p>
631 * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value. 750 * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
632 * <p> 751 * <p>
633 * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in 752 * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
634 * <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
635 * <p> 753 * <p>
636 * The segments are returned in order of appearance. 754 * The segments are returned in order of appearance.
637 * 755 *
638 * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment. 756 * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
639 * @see #getURIAttributes() 757 * @see #getURIAttributes()
640 */ 758 */
641 public List<Segment> getStyleURISegments() { 759 public List<Segment> getStyleURISegments()
760 {
642 return URIAttributes.getStyleURISegments(this); 761 return URIAttributes.getStyleURISegments(this);
643 } 762 }
644 763
645 /** 764 /**
646 * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment. 765 * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
647 * <p> 766 * <p>
648 * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>, 767 * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
649 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
650 * 768 *
651 * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 769 * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
652 */ 770 */
653 public final StartTag getFirstStartTag() { 771 public final StartTag getFirstStartTag()
772 {
654 return checkEnclosure(source.getNextStartTag(begin)); 773 return checkEnclosure(source.getNextStartTag(begin));
655 } 774 }
656 775
657 /** 776 /**
658 * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment. 777 * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
659 * <p> 778 * <p>
660 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>, 779 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
661 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
662 * 780 *
781 * @param startTagType
663 * @param startTagType the <code>StartTagType</code> to search for. 782 * the <code>StartTagType</code> to search for.
664 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 783 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
665 */ 784 */
666 public final StartTag getFirstStartTag(StartTagType startTagType) { 785 public final StartTag getFirstStartTag(StartTagType startTagType)
786 {
667 return checkEnclosure(source.getNextStartTag(begin,startTagType)); 787 return checkEnclosure(source.getNextStartTag(begin, startTagType));
668 }
669 788 }
789
670 /** 790 /**
671 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} with the specified {@linkplain StartTag#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment. 791 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} with the specified {@linkplain StartTag#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
672 * <p> 792 * <p>
673 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>, 793 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
674 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
675 * <p> 794 * <p>
676 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}. 795 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
677 * 796 *
797 * @param name
678 * @param name the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>. 798 * the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
679 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} with the specified {@linkplain StartTag#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 799 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} with the specified {@linkplain StartTag#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
680 */ 800 */
681 public final StartTag getFirstStartTag(String name) { 801 public final StartTag getFirstStartTag(String name)
802 {
682 return checkEnclosure(source.getNextStartTag(begin,name)); 803 return checkEnclosure(source.getNextStartTag(begin, name));
683 }
684 804 }
805
685 /** 806 /**
686 * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment. 807 * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
687 * <p> 808 * <p>
688 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, 809 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
689 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
690 * 810 *
811 * @param attributeName
691 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 812 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
813 * @param value
692 * @param value the value of the specified attribute to search for, must not be <code>null</code>. 814 * the value of the specified attribute to search for, must not be <code>null</code>.
815 * @param valueCaseSensitive
693 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive. 816 * specifies whether the attribute value matching is case sensitive.
694 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 817 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
695 * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern) 818 * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
696 */ 819 */
697 public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) { 820 public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
821 {
698 return checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive)); 822 return checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
699 } 823 }
700 824
701 /** 825 /**
702 * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment. 826 * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
703 * <p> 827 * <p>
704 * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, 828 * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
705 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
706 * 829 *
830 * @param attributeName
707 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 831 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
832 * @param valueRegexPattern
708 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>. 833 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
709 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 834 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
710 * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) 835 * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
711 */ 836 */
712 public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) { 837 public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern)
838 {
713 return checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern)); 839 return checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
714 } 840 }
715 841
716 /** 842 /**
717 * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment. 843 * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
718 * <p> 844 * <p>
719 * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>, 845 * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
720 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
721 * 846 *
847 * @param className
722 * @param className the class name (case sensitive) to search for, must not be <code>null</code>. 848 * the class name (case sensitive) to search for, must not be <code>null</code>.
723 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 849 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
724 */ 850 */
725 public final StartTag getFirstStartTagByClass(final String className) { 851 public final StartTag getFirstStartTagByClass(final String className)
852 {
726 return checkEnclosure(source.getNextStartTagByClass(begin,className)); 853 return checkEnclosure(source.getNextStartTagByClass(begin, className));
727 }
728 854 }
855
729 /** 856 /**
730 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment. 857 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
731 * <p> 858 * <p>
732 * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>, 859 * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
733 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
734 * <p> 860 * <p>
735 * If this segment is itself an {@link Element}, this element is returned, not the first child element. 861 * If this segment is itself an {@link Element}, this element is returned, not the first child element.
736 * 862 *
737 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 863 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
738 */ 864 */
739 public final Element getFirstElement() { 865 public final Element getFirstElement()
866 {
740 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin)); 867 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
741 while (startTag!=null) { 868 while (startTag != null)
869 {
742 final Element element=startTag.getElement(); 870 final Element element = startTag.getElement();
743 if (element.end<=end) return element; 871 if (element.end <= end) return element;
744 startTag=checkEnclosure(startTag.getNextStartTag()); 872 startTag = checkEnclosure(startTag.getNextStartTag());
745 } 873 }
746 return null; 874 return null;
747 } 875 }
748 876
749 /** 877 /**
750 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment. 878 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
751 * <p> 879 * <p>
752 * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>, 880 * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
753 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
754 * <p> 881 * <p>
755 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}. 882 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
756 * <p> 883 * <p>
757 * If this segment is itself an {@link Element} with the specified name, this element is returned. 884 * If this segment is itself an {@link Element} with the specified name, this element is returned.
758 * 885 *
886 * @param name
759 * @param name the {@linkplain Element#getName() name} of the element to search for. 887 * the {@linkplain Element#getName() name} of the element to search for.
760 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 888 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
761 */ 889 */
762 public final Element getFirstElement(String name) { 890 public final Element getFirstElement(String name)
891 {
763 if (name==null) return getFirstElement(); 892 if (name == null) return getFirstElement();
764 final boolean isXMLTagName=Tag.isXMLName(name); 893 final boolean isXMLTagName = Tag.isXMLName(name);
765 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName)); 894 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
766 while (startTag!=null) { 895 while (startTag != null)
896 {
767 final Element element=startTag.getElement(); 897 final Element element = startTag.getElement();
768 if (element.end<=end) return element; 898 if (element.end <= end) return element;
769 startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName)); 899 startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
770 } 900 }
771 return null; 901 return null;
772 } 902 }
773 903
774 /** 904 /**
775 * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment. 905 * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
776 * <p> 906 * <p>
777 * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, 907 * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
778 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
779 * <p> 908 * <p>
780 * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned. 909 * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
781 * 910 *
911 * @param attributeName
782 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 912 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
913 * @param value
783 * @param value the value of the specified attribute to search for, must not be <code>null</code>. 914 * the value of the specified attribute to search for, must not be <code>null</code>.
915 * @param valueCaseSensitive
784 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive. 916 * specifies whether the attribute value matching is case sensitive.
785 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 917 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
786 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern) 918 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
787 */ 919 */
788 public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) { 920 public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
921 {
789 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive)); 922 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
790 while (startTag!=null) { 923 while (startTag != null)
924 {
791 final Element element=startTag.getElement(); 925 final Element element = startTag.getElement();
792 if (element.end<=end) return element; 926 if (element.end <= end) return element;
793 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive)); 927 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
794 } 928 }
795 return null; 929 return null;
796 } 930 }
797 931
798 /** 932 /**
799 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment. 933 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
800 * <p> 934 * <p>
801 * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, 935 * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
802 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
803 * <p> 936 * <p>
804 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned. 937 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
805 * 938 *
939 * @param attributeName
806 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>. 940 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
941 * @param valueRegexPattern
807 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>. 942 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
808 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 943 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
809 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive) 944 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
810 */ 945 */
811 public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) { 946 public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern)
947 {
812 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern)); 948 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
813 while (startTag!=null) { 949 while (startTag != null)
950 {
814 final Element element=startTag.getElement(); 951 final Element element = startTag.getElement();
815 if (element.end<=end) return element; 952 if (element.end <= end) return element;
816 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern)); 953 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
817 } 954 }
818 return null; 955 return null;
819 } 956 }
820 957
821 /** 958 /**
822 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment. 959 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
823 * <p> 960 * <p>
824 * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>, 961 * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
825 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
826 * <p> 962 * <p>
827 * If this segment is itself an {@link Element} with the specified class, this element is returned. 963 * If this segment is itself an {@link Element} with the specified class, this element is returned.
828 * 964 *
965 * @param className
829 * @param className the class name (case sensitive) to search for, must not be <code>null</code>. 966 * the class name (case sensitive) to search for, must not be <code>null</code>.
830 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists. 967 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
831 */ 968 */
832 public final Element getFirstElementByClass(final String className) { 969 public final Element getFirstElementByClass(final String className)
970 {
833 StartTag startTag=checkEnclosure(source.getNextStartTagByClass(begin,className)); 971 StartTag startTag = checkEnclosure(source.getNextStartTagByClass(begin, className));
834 while (startTag!=null) { 972 while (startTag != null)
973 {
835 final Element element=startTag.getElement(); 974 final Element element = startTag.getElement();
836 if (element.end<=end) return element; 975 if (element.end <= end) return element;
837 startTag=checkEnclosure(source.getNextStartTagByClass(startTag.begin+1,className)); 976 startTag = checkEnclosure(source.getNextStartTagByClass(startTag.begin + 1, className));
838 } 977 }
839 return null; 978 return null;
840 } 979 }
841 980
842 /** 981 /**
843 * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 982 * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
983 *
844 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment. 984 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
845 */ 985 */
846 public List<FormControl> getFormControls() { 986 public List<FormControl> getFormControls()
987 {
847 return FormControl.getAll(this); 988 return FormControl.getAll(this);
848 } 989 }
849 990
850 /** 991 /**
851 * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment. 992 * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
852 * <p> 993 * <p>
853 * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>. 994 * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
854 * 995 *
855 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment. 996 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
856 * @see #getFormControls() 997 * @see #getFormControls()
857 */ 998 */
858 public FormFields getFormFields() { 999 public FormFields getFormFields()
1000 {
859 return new FormFields(getFormControls()); 1001 return new FormFields(getFormControls());
860 } 1002 }
861 1003
862 /** 1004 /**
863 * Parses any {@link Attributes} within this segment. 1005 * Parses any {@link Attributes} within this segment.
864 * This method is only used in the unusual situation where attributes exist outside of a start tag. 1006 * This method is only used in the unusual situation where attributes exist outside of a start tag.
865 * The {@link StartTag#getAttributes()} method should be used in normal situations. 1007 * The {@link StartTag#getAttributes()} method should be used in normal situations.
866 * <p> 1008 * <p>
867 * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>. 1009 * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
868 * 1010 *
869 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing. 1011 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
870 */ 1012 */
871 public Attributes parseAttributes() { 1013 public Attributes parseAttributes()
1014 {
872 return source.parseAttributes(begin,end); 1015 return source.parseAttributes(begin, end);
873 } 1016 }
874 1017
875 /** 1018 /**
876 * Causes this segment to be ignored when parsing. 1019 * Causes this segment to be ignored when parsing.
877 * <p> 1020 * <p>
878 * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions. 1021 * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
879 * <p> 1022 * <p>
880 * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside 1023 * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value), as well as preventing non-server tags from being recognised inside server tags.
881 * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
882 * (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
883 * as well as preventing non-server tags from being recognised inside server tags.
884 * <p> 1024 * <p>
885 * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags, 1025 * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags, as the attributes parser automatically ignores any server tags.
886 * as the attributes parser automatically ignores any server tags.
887 * <p> 1026 * <p>
888 * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements, 1027 * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements, as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
889 * as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
890 * <p> 1028 * <p>
891 * This leaves only very few scenarios where calling this method still provides a significant benefit. 1029 * This leaves only very few scenarios where calling this method still provides a significant benefit.
892 * <p> 1030 * <p>
893 * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags. 1031 * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags. Here is an example using an XML-style JSP tag: <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote> The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute, as there is no way for the parser to recognise the <code>i18n:resource</code> element
894 * Here is an example using an XML-style JSP tag: 1032 * as a server tag. Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice, but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
895 * <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote>
896 * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
897 * as there is no way for the parser to recognise the <code>i18n:resource</code> element as a server tag.
898 * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
899 * but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
900 * find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
901 * <p> 1033 * <p>
902 * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely. 1034 * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely. Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of tools such as {@link TextExtractor} and {@link Renderer}.
903 * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
904 * tools such as {@link TextExtractor} and {@link Renderer}.
905 * <p> 1035 * <p>
906 * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or 1036 * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment. Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())} and perform the desired operations on this new source object.
907 * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
908 * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
909 * and perform the desired operations on this new source object.
910 * <p> 1037 * <p>
911 * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>. 1038 * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
912 * <p> 1039 * <p>
913 * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache}, 1040 * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache}, and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>. If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache. Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
914 * and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
915 * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
916 * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
917 * <p> 1041 * <p>
918 * For best performance, this method should be called on all segments that need to be ignored without calling 1042 * For best performance, this method should be called on all segments that need to be ignored without calling any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
919 * any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
920 * 1043 *
921 * @see Source#ignoreWhenParsing(Collection segments) 1044 * @see Source#ignoreWhenParsing(Collection segments)
922 */ 1045 */
923 public void ignoreWhenParsing() { 1046 public void ignoreWhenParsing()
1047 {
924 source.ignoreWhenParsing(begin,end); 1048 source.ignoreWhenParsing(begin, end);
925 } 1049 }
926 1050
927 /** 1051 /**
928 * Compares this <code>Segment</code> object to another object. 1052 * Compares this <code>Segment</code> object to another object.
929 * <p> 1053 * <p>
930 * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown. 1054 * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
931 * <p> 1055 * <p>
932 * A segment is considered to be before another segment if its begin position is earlier, 1056 * A segment is considered to be before another segment if its begin position is earlier, or in the case that both segments begin at the same position, its end position is earlier.
933 * or in the case that both segments begin at the same position, its end position is earlier.
934 * <p> 1057 * <p>
935 * Segments that begin and end at the same position are considered equal for 1058 * Segments that begin and end at the same position are considered equal for the purposes of this comparison, even if they relate to different source documents.
936 * the purposes of this comparison, even if they relate to different source documents.
937 * <p> 1059 * <p>
938 * Note: this class has a natural ordering that is inconsistent with equals. 1060 * Note: this class has a natural ordering that is inconsistent with equals. This means that this method may return zero in some cases where calling the {@link #equals(Object)} method with the same argument returns <code>false</code>.
939 * This means that this method may return zero in some cases where calling the
940 * {@link #equals(Object)} method with the same argument returns <code>false</code>.
941 * 1061 *
942 * @param segment the segment to be compared 1062 * @param segment
1063 * the segment to be compared
943 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment. 1064 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
944 * @throws ClassCastException if the argument is not a <code>Segment</code> 1065 * @throws ClassCastException
1066 * if the argument is not a <code>Segment</code>
945 */ 1067 */
946 public int compareTo(final Segment segment) { 1068 public int compareTo(final Segment segment)
1069 {
947 if (this==segment) return 0; 1070 if (this == segment) return 0;
948 if (begin<segment.begin) return -1; 1071 if (begin < segment.begin) return -1;
949 if (begin>segment.begin) return 1; 1072 if (begin > segment.begin) return 1;
950 if (end<segment.end) return -1; 1073 if (end < segment.end) return -1;
951 if (end>segment.end) return 1; 1074 if (end > segment.end) return 1;
952 return 0; 1075 return 0;
953 } 1076 }
954 1077
955 /** 1078 /**
956 * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}. 1079 * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
1080 *
957 * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>. 1081 * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
958 */ 1082 */
959 public final boolean isWhiteSpace() { 1083 public final boolean isWhiteSpace()
1084 {
960 for (int i=begin; i<end; i++) 1085 for (int i = begin; i < end; i++)
961 if (!isWhiteSpace(source.charAt(i))) return false; 1086 if (!isWhiteSpace(source.charAt(i))) return false;
962 return true; 1087 return true;
963 } 1088 }
964 1089
965 /** 1090 /**
966 * Returns an indication of the maximum depth of nested elements within this segment. 1091 * Returns an indication of the maximum depth of nested elements within this segment.
967 * <p> 1092 * <p>
968 * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code> 1093 * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code> if its content is parsed.
969 * if its content is parsed.
970 * <p> 1094 * <p>
971 * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught. 1095 * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught. The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling this method to check every segment or document will very often exceed any benefit.
972 * The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling
973 * this method to check every segment or document will very often exceed any benefit.
974 * <p> 1096 * <p>
975 * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application 1097 * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application and other factors.
976 * and other factors.
977 * <p> 1098 * <p>
978 * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the 1099 * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the {@link Element#getDepth()} method on the most nested element.
979 * {@link Element#getDepth()} method on the most nested element.
980 * 1100 *
981 * @return an indication of the maximum depth of nested elements within this segment. 1101 * @return an indication of the maximum depth of nested elements within this segment.
982 */ 1102 */
983 public int getMaxDepthIndicator() { 1103 public int getMaxDepthIndicator()
1104 {
984 int maxDepth=0; 1105 int maxDepth = 0;
985 int depth=0; 1106 int depth = 0;
986 for (Tag tag : getAllTags()) { 1107 for (Tag tag : getAllTags())
1108 {
987 if (tag instanceof StartTag) { 1109 if (tag instanceof StartTag)
1110 {
988 StartTag startTag=(StartTag)tag; 1111 StartTag startTag = (StartTag) tag;
989 if (startTag.getStartTagType().getCorrespondingEndTagType()==null) continue; 1112 if (startTag.getStartTagType().getCorrespondingEndTagType() == null) continue;
990 if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue; 1113 if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
991 if (startTag.isEmptyElementTag()) continue; 1114 if (startTag.isEmptyElementTag()) continue;
992 depth++; 1115 depth++;
993 if (depth>maxDepth) maxDepth++; 1116 if (depth > maxDepth) maxDepth++;
1117 }
994 } else { 1118 else
1119 {
995 depth--; 1120 depth--;
996 } 1121 }
997 } 1122 }
998 return maxDepth; 1123 return maxDepth;
999 } 1124 }
1000 1125
1001 /** 1126 /**
1002 * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>. 1127 * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
1003 * <p> 1128 * <p>
1004 * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a> 1129 * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a> specifies the following white space characters:
1005 * specifies the following white space characters:
1006 * <ul> 1130 * <ul>
1007 * <li>space (U+0020) 1131 * <li>space (U+0020)
1008 * <li>tab (U+0009) 1132 * <li>tab (U+0009)
1009 * <li>form feed (U+000C) 1133 * <li>form feed (U+000C)
1010 * <li>line feed (U+000A) 1134 * <li>line feed (U+000A)
1011 * <li>carriage return (U+000D) 1135 * <li>carriage return (U+000D)
1012 * <li>zero-width space (U+200B) 1136 * <li>zero-width space (U+200B)
1013 * </ul> 1137 * </ul>
1014 * <p> 1138 * <p>
1015 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not 1139 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not recognise them as white space and renders them as an unprintable character (empty square). Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
1016 * recognise them as white space and renders them as an unprintable character (empty square).
1017 * Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
1018 * 1140 *
1141 * @param ch
1019 * @param ch the character to test. 1142 * the character to test.
1020 * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>. 1143 * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
1021 */ 1144 */
1022 public static final boolean isWhiteSpace(final char ch) { 1145 public static final boolean isWhiteSpace(final char ch)
1023 for (char whiteSpaceChar : WHITESPACE) if (ch==whiteSpaceChar) return true; 1146 {
1147 for (char whiteSpaceChar : WHITESPACE)
1148 if (ch == whiteSpaceChar) return true;
1024 return false; 1149 return false;
1025 } 1150 }
1026 1151
1027 /** 1152 /**
1028 * Returns a string representation of this object useful for debugging purposes. 1153 * Returns a string representation of this object useful for debugging purposes.
1154 *
1029 * @return a string representation of this object useful for debugging purposes. 1155 * @return a string representation of this object useful for debugging purposes.
1030 */ 1156 */
1031 public String getDebugInfo() { 1157 public String getDebugInfo()
1158 {
1032 final StringBuilder sb=new StringBuilder(50); 1159 final StringBuilder sb = new StringBuilder(50);
1033 sb.append('('); 1160 sb.append('(');
1034 source.getRowColumnVector(begin).appendTo(sb); 1161 source.getRowColumnVector(begin).appendTo(sb);
1035 sb.append('-'); 1162 sb.append('-');
1036 source.getRowColumnVector(end).appendTo(sb); 1163 source.getRowColumnVector(end).appendTo(sb);
1037 sb.append(')'); 1164 sb.append(')');
1038 return sb.toString(); 1165 return sb.toString();
1039 } 1166 }
1040 1167
1041 /** 1168 /**
1042 * Returns the character at the specified index. 1169 * Returns the character at the specified index.
1043 * <p> 1170 * <p>
1044 * This is logically equivalent to <code>toString().charAt(index)</code> 1171 * This is logically equivalent to <code>toString().charAt(index)</code> for valid argument values <code>0 &lt;= index &lt; length()</code>.
1045 * for valid argument values <code>0 &lt;= index &lt; length()</code>.
1046 * <p> 1172 * <p>
1047 * However because this implementation works directly on the underlying document source string, 1173 * However because this implementation works directly on the underlying document source string, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for an invalid argument value.
1048 * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
1049 * for an invalid argument value.
1050 * 1174 *
1051 * @param index the index of the character. 1175 * @param index
1176 * the index of the character.
1052 * @return the character at the specified index. 1177 * @return the character at the specified index.
1053 */ 1178 */
1054 public char charAt(final int index) { 1179 public char charAt(final int index)
1180 {
1055 return source.charAt(begin+index); 1181 return source.charAt(begin + index);
1056 } 1182 }
1057 1183
1058 /** 1184 /**
1059 * Returns a new character sequence that is a subsequence of this sequence. 1185 * Returns a new character sequence that is a subsequence of this sequence.
1060 * <p> 1186 * <p>
1061 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code> 1187 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code> for valid values of <code>beginIndex</code> and <code>endIndex</code>.
1062 * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
1063 * <p> 1188 * <p>
1064 * However because this implementation works directly on the underlying document source text, 1189 * However because this implementation works directly on the underlying document source text, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
1065 * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
1066 * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
1067 * 1190 *
1068 * @param beginIndex the begin index, inclusive. 1191 * @param beginIndex
1069 * @param endIndex the end index, exclusive. 1192 * the begin index, inclusive.
1193 * @param endIndex
1194 * the end index, exclusive.
1070 * @return a new character sequence that is a subsequence of this sequence. 1195 * @return a new character sequence that is a subsequence of this sequence.
1071 */ 1196 */
1072 public CharSequence subSequence(final int beginIndex, final int endIndex) { 1197 public CharSequence subSequence(final int beginIndex, final int endIndex)
1198 {
1073 return source.subSequence(begin+beginIndex,begin+endIndex); 1199 return source.subSequence(begin + beginIndex, begin + endIndex);
1074 } 1200 }
1075 1201
1076 /** 1202 /**
1077 * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text. 1203 * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
1078 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space. 1204 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
1079 */ 1205 */
1080 static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) { 1206 static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text)
1207 {
1081 final int textLength=text.length(); 1208 final int textLength = text.length();
1082 int i=0; 1209 int i = 0;
1083 boolean lastWasWhiteSpace=false; 1210 boolean lastWasWhiteSpace = false;
1084 while (true) { 1211 while (true)
1212 {
1085 if (i>=textLength) return sb; 1213 if (i >= textLength) return sb;
1086 if (!isWhiteSpace(text.charAt(i))) break; 1214 if (!isWhiteSpace(text.charAt(i))) break;
1087 i++; 1215 i++;
1088 } 1216 }
1089 do { 1217 do
1218 {
1090 final char ch=text.charAt(i++); 1219 final char ch = text.charAt(i++);
1091 if (isWhiteSpace(ch)) { 1220 if (isWhiteSpace(ch))
1221 {
1092 lastWasWhiteSpace=true; 1222 lastWasWhiteSpace = true;
1223 }
1093 } else { 1224 else
1225 {
1094 if (lastWasWhiteSpace) { 1226 if (lastWasWhiteSpace)
1227 {
1095 sb.append(' '); 1228 sb.append(' ');
1096 lastWasWhiteSpace=false; 1229 lastWasWhiteSpace = false;
1097 } 1230 }
1098 sb.append(ch); 1231 sb.append(ch);
1099 } 1232 }
1233 }
1100 } while (i<textLength); 1234 while (i < textLength);
1101 return sb; 1235 return sb;
1102 } 1236 }
1103 1237
1104 static final Pattern getClassPattern(final String className) { 1238 static final Pattern getClassPattern(final String className)
1239 {
1105 return Pattern.compile(".*(\\s|^)"+className+"(\\s|$).*",Pattern.DOTALL); 1240 return Pattern.compile(".*(\\s|^)" + className + "(\\s|$).*", Pattern.DOTALL);
1106 } 1241 }
1107 1242
1108 private List<Element> getAllElements(final List<StartTag> startTags) { 1243 private List<Element> getAllElements(final List<StartTag> startTags)
1244 {
1109 if (startTags.isEmpty()) return Collections.emptyList(); 1245 if (startTags.isEmpty()) return Collections.emptyList();
1110 final ArrayList<Element> elements=new ArrayList<Element>(startTags.size()); 1246 final ArrayList<Element> elements = new ArrayList<Element>(startTags.size());
1111 for (StartTag startTag : startTags) { 1247 for (StartTag startTag : startTags)
1248 {
1112 final Element element=startTag.getElement(); 1249 final Element element = startTag.getElement();
1113 if (element.end<=end) elements.add(element); 1250 if (element.end <= end) elements.add(element);
1114 } 1251 }
1115 return elements; 1252 return elements;
1116 } 1253 }
1117 1254
1118 private StartTag checkEnclosure(final StartTag startTag) { 1255 private StartTag checkEnclosure(final StartTag startTag)
1256 {
1119 if (startTag==null || startTag.end>end) return null; 1257 if (startTag == null || startTag.end > end) return null;
1120 return startTag; 1258 return startTag;
1121 } 1259 }
1122 1260
1123 private Tag checkTagEnclosure(final Tag tag) { 1261 private Tag checkTagEnclosure(final Tag tag)
1262 {
1124 if (tag==null || tag.end>end) return null; 1263 if (tag == null || tag.end > end) return null;
1125 return tag; 1264 return tag;
1126 } 1265 }
1127 1266
1128 private CharacterReference getNextCharacterReference(final int pos) { 1267 private CharacterReference getNextCharacterReference(final int pos)
1268 {
1129 final CharacterReference characterReference=source.getNextCharacterReference(pos); 1269 final CharacterReference characterReference = source.getNextCharacterReference(pos);
1130 if (characterReference==null || characterReference.end>end) return null; 1270 if (characterReference == null || characterReference.end > end) return null;
1131 return characterReference; 1271 return characterReference;
1132 } 1272 }
1133} 1273}
1134

Legend:
Removed from v.3  
changed lines
  Added in v.4

   
Visit the aagtl Website