/[aagtl_public1]/src/net/htmlparser/jericho/Segment.java
aagtl

Contents of /src/net/htmlparser/jericho/Segment.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 8 months ago) by zoffadmin
File size: 59598 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.Iterator;
24 import java.util.List;
25 import java.util.Collections;
26 import java.util.ArrayList;
27 import java.util.regex.Pattern;
28
29 /**
30 * Represents a segment of a {@link Source} document.
31 * <p>
32 * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
33 * <p>
34 * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
35 */
36 public class Segment implements Comparable<Segment>, CharSequence {
37 final int begin;
38 final int end;
39 final Source source;
40
41 private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method
42
43 /**
44 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
45 * @param source the {@link Source} document, must not be <code>null</code>.
46 * @param begin the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
47 * @param end the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
48 */
public Segment(final Source source, final int begin, final int end) {
	// Validate every argument before any field is assigned, and say which argument is at fault.
	if (source==null) throw new IllegalArgumentException("source argument must not be null");
	// A negative position can never address a character, so all negative values are rejected (not only -1).
	// end<0 is implied once begin>=0 and begin<=end hold, but is spelled out for readability.
	if (begin<0 || end<0 || begin>end) {
		throw new IllegalArgumentException("invalid segment span: begin="+begin+", end="+end);
	}
	this.source=source;
	this.begin=begin;
	this.end=end;
}
56
57 // Only called from Source constructor
Segment(final int length) {
	// The segment spans the whole document: positions 0 (inclusive) to length (exclusive).
	this.begin=0;
	this.end=length;
	// The instance under construction is its own source, so it must actually be a Source
	// (the cast fails with a ClassCastException otherwise).
	this.source=(Source)this;
}
63
64 // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
Segment() {
	// Delegates to the (begin,end) dummy constructor: an empty span at position 0 with no source.
	this(0,0);
}
68
69 // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
Segment(final int begin, final int end) {
	// No validation is performed and no source is attached.
	this.source=null;
	this.begin=begin;
	this.end=end;
}
75
76 /**
77 * Returns the {@link Source} document containing this segment.
78 * <p>
79 * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
80 *
81 * @return the {@link Source} document containing this segment.
82 */
83 public final Source getSource() {
84 if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
85 return source;
86 }
87
88 /**
89 * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
90 * <p>
91 * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
92 *
93 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
94 */
95 public final int getBegin() {
96 return begin;
97 }
98
99 /**
100 * Returns the character position in the {@link Source} document immediately after the end of this segment.
101 * <p>
102 * The character at the position specified by this property is <b>not</b> included in the segment.
103 *
104 * @return the character position in the {@link Source} document immediately after the end of this segment.
105 * @see #getBegin()
106 */
107 public final int getEnd() {
108 return end;
109 }
110
111 /**
112 * Compares the specified object with this <code>Segment</code> for equality.
113 * <p>
114 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
115 * and both segments have the same {@link Source}, and the same begin and end positions.
116 * @param object the object to be compared for equality with this <code>Segment</code>.
117 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
118 */
119 public final boolean equals(final Object object) {
120 if (this==object) return true;
121 if (object==null || !(object instanceof Segment)) return false;
122 final Segment segment=(Segment)object;
123 return segment.begin==begin && segment.end==end && segment.source==source;
124 }
125
126 /**
127 * Returns a hash code value for the segment.
128 * <p>
129 * The current implementation returns the sum of the begin and end positions, although this is not
130 * guaranteed in future versions.
131 *
132 * @return a hash code value for the segment.
133 */
134 public int hashCode() {
135 return begin+end;
136 }
137
138 /**
139 * Returns the length of the segment.
140 * This is defined as the number of characters between the begin and end positions.
141 * @return the length of the segment.
142 */
143 public int length() {
144 return end-begin;
145 }
146
147 /**
148 * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
149 * <p>
150 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
151 * <p>
152 * Note that a segment encloses itself.
153 *
154 * @param segment the segment to be tested for being enclosed by this segment.
155 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
156 */
157 public final boolean encloses(final Segment segment) {
158 return begin<=segment.begin && end>=segment.end;
159 }
160
161 /**
162 * Indicates whether this segment encloses the specified character position in the source document.
163 * <p>
164 * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
165 *
166 * @param pos the position in the {@link Source} document.
167 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
168 */
169 public final boolean encloses(final int pos) {
170 return begin<=pos && pos<end;
171 }
172
173 /**
174 * Returns the source text of this segment as a <code>String</code>.
175 * <p>
176 * The returned <code>String</code> is newly created with every call to this method, unless this
177 * segment is itself an instance of {@link Source}.
178 *
179 * @return the source text of this segment as a <code>String</code>.
180 */
181 public String toString() {
182 return source.subSequence(begin,end).toString();
183 }
184
185 /**
186 * Performs a simple rendering of the HTML markup in this segment into text.
187 * <p>
188 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
189 * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
190 *
191 * @return an instance of {@link Renderer} based on this segment.
192 * @see #getTextExtractor()
193 */
194 public Renderer getRenderer() {
195 return new Renderer(this);
196 }
197
198 /**
199 * Extracts the textual content from the HTML markup of this segment.
200 * <p>
201 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
202 * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
203 * <p>
204 * @return an instance of {@link TextExtractor} based on this segment.
205 * @see #getRenderer()
206 */
207 public TextExtractor getTextExtractor() {
208 return new TextExtractor(this);
209 }
210
211 /**
212 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
213 * <p>
214 * See the {@link Source#iterator()} method for a detailed description.
215 * <p>
216 * <dl>
217 * <dt>Example:</dt>
218 * <dd>
219 * <p>
220 * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
221 * </p>
222 * <pre>
223 * for (Iterator&lt;Segment&gt; nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) {
224 * Segment nodeSegment=nodeIterator.next();
225 * if (nodeSegment instanceof Tag) {
226 * Tag tag=(Tag)nodeSegment;
227 * // HANDLE TAG
228 * // Uncomment the following line to ensure each tag is valid XML:
229 * // writer.write(tag.tidy()); continue;
230 * } else if (nodeSegment instanceof CharacterReference) {
231 * CharacterReference characterReference=(CharacterReference)nodeSegment;
232 * // HANDLE CHARACTER REFERENCE
233 * // Uncomment the following line to decode all character references instead of copying them verbatim:
234 * // characterReference.appendCharTo(writer); continue;
235 * } else {
236 * // HANDLE PLAIN TEXT
237 * }
238 * // unless specific handling has prevented getting to here, simply output the segment as is:
239 * writer.write(nodeSegment.toString());
240 * }</pre>
241 * </dd>
242 * </dl>
243 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
244 */
245 public Iterator<Segment> getNodeIterator() {
246 return new NodeIterator(this);
247 }
248
249 /**
250 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
251 * <p>
252 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
253 * if this method is to be used on a large proportion of the source.
254 * It is called automatically if this method is called on the {@link Source} object itself.
255 * <p>
256 * See the {@link Tag} class documentation for more details about the behaviour of this method.
257 *
258 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
259 */
260 public List<Tag> getAllTags() {
261 return getAllTags(null);
262 }
263
264 /**
265 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
266 * <p>
267 * See the {@link Tag} class documentation for more details about the behaviour of this method.
268 * <p>
269 * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
270 *
271 * @param tagType the {@linkplain TagType type} of tags to get.
272 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
273 * @see #getAllStartTags(StartTagType)
274 */
275 public List<Tag> getAllTags(final TagType tagType) {
276 Tag tag=checkTagEnclosure(Tag.getNextTag(source,begin,tagType));
277 if (tag==null) return Collections.emptyList();
278 final ArrayList<Tag> list=new ArrayList<Tag>();
279 do {
280 list.add(tag);
281 tag=checkTagEnclosure(tag.getNextTag(tagType));
282 } while (tag!=null);
283 return list;
284 }
285
286 /**
287 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
288 * <p>
289 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
290 * if this method is to be used on a large proportion of the source.
291 * It is called automatically if this method is called on the {@link Source} object itself.
292 * <p>
293 * See the {@link Tag} class documentation for more details about the behaviour of this method.
294 *
295 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
296 */
297 public List<StartTag> getAllStartTags() {
298 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
299 if (startTag==null) return Collections.emptyList();
300 final ArrayList<StartTag> list=new ArrayList<StartTag>();
301 do {
302 list.add(startTag);
303 startTag=checkEnclosure(startTag.getNextStartTag());
304 } while (startTag!=null);
305 return list;
306 }
307
308 /**
309 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
310 * <p>
311 * See the {@link Tag} class documentation for more details about the behaviour of this method.
312 * <p>
313 * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
314 *
315 * @param startTagType the {@linkplain StartTagType type} of tags to get.
316 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
317 */
318 public List<StartTag> getAllStartTags(final StartTagType startTagType) {
319 if (startTagType==null) return getAllStartTags();
320 StartTag startTag=(StartTag)checkTagEnclosure(Tag.getNextTag(source,begin,startTagType));
321 if (startTag==null) return Collections.emptyList();
322 final ArrayList<StartTag> list=new ArrayList<StartTag>();
323 do {
324 list.add(startTag);
325 startTag=(StartTag)checkTagEnclosure(startTag.getNextTag(startTagType));
326 } while (startTag!=null);
327 return list;
328 }
329
330 /**
331 * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
332 * <p>
333 * See the {@link Tag} class documentation for more details about the behaviour of this method.
334 * <p>
335 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
336 * <p>
337 * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
338 *
339 * @param name the {@linkplain StartTag#getName() name} of the start tags to get.
340 * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
341 */
342 public List<StartTag> getAllStartTags(String name) {
343 if (name==null) return getAllStartTags();
344 final boolean isXMLTagName=Tag.isXMLName(name);
345 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
346 if (startTag==null) return Collections.emptyList();
347 final ArrayList<StartTag> list=new ArrayList<StartTag>();
348 do {
349 list.add(startTag);
350 startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
351 } while (startTag!=null);
352 return list;
353 }
354
355 /**
356 * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
357 * <p>
358 * See the {@link Tag} class documentation for more details about the behaviour of this method.
359 *
360 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
361 * @param value the value of the specified attribute to search for, must not be <code>null</code>.
362 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
363 * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
364 * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
365 */
366 public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) {
367 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
368 if (startTag==null) return Collections.emptyList();
369 final ArrayList<StartTag> list=new ArrayList<StartTag>();
370 do {
371 list.add(startTag);
372 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
373 } while (startTag!=null);
374 return list;
375 }
376
377 /**
378 * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
379 * <p>
380 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
381 * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
382 * <p>
383 * See the {@link Tag} class documentation for more details about the behaviour of this method.
384 *
385 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
386 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
387 * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
388 * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
389 */
390 public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern) {
391 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
392 if (startTag==null) return Collections.emptyList();
393 final ArrayList<StartTag> list=new ArrayList<StartTag>();
394 do {
395 list.add(startTag);
396 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
397 } while (startTag!=null);
398 return list;
399 }
400
401 /**
402 * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
403 * <p>
404 * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
405 * class names separated by white space in the attribute value.
406 * <p>
407 * See the {@link Tag} class documentation for more details about the behaviour of this method.
408 *
409 * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
410 * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
411 */
412 public List<StartTag> getAllStartTagsByClass(final String className) {
413 return getAllStartTags("class",getClassPattern(className));
414 }
415
416 /**
417 * Returns a list of the immediate children of this segment in the document element hierarchy.
418 * <p>
419 * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
420 * <p>
421 * An element found at the start of this segment is included in the list.
422 * Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
423 * which only returns the children of the element.
424 * <p>
425 * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
426 * <p>
427 * The objects in the list are all of type {@link Element}.
428 * <p>
429 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
430 * if this method is to be used on a large proportion of the source.
431 * It is called automatically if this method is called on the {@link Source} object itself.
432 * <p>
433 * See the {@link Source#getChildElements()} method for more details.
434 *
435 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
436 * @see Element#getParentElement()
437 */
438 public List<Element> getChildElements() {
439 if (length()==0) return Collections.emptyList();
440 List<Element> childElements=new ArrayList<Element>();
441 int pos=begin;
442 while (true) {
443 final StartTag childStartTag=source.getNextStartTag(pos);
444 if (childStartTag==null || childStartTag.begin>=end) break;
445 if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
446 pos=childStartTag.end;
447 continue;
448 }
449 final Element childElement=childStartTag.getElement();
450 childElements.add(childElement);
451 childElement.getChildElements();
452 pos=childElement.end;
453 }
454 return childElements;
455 }
456
457 /**
458 * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
459 * <p>
460 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
461 * if this method is to be used on a large proportion of the source.
462 * It is called automatically if this method is called on the {@link Source} object itself.
463 * <p>
464 * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
465 * <p>
466 * If this segment is itself an {@link Element}, the result includes this element in the list.
467 *
468 * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
469 */
470 public List<Element> getAllElements() {
471 return getAllElements(getAllStartTags());
472 }
473
474 /**
475 * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
476 * <p>
477 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method,
478 * except that elements which are not entirely enclosed by this segment are excluded.
479 * <p>
480 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
481 * <p>
482 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
483 * <p>
484 * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
485 *
486 * @param name the {@linkplain Element#getName() name} of the elements to get.
487 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
488 */
489 public List<Element> getAllElements(String name) {
490 return getAllElements(getAllStartTags(name));
491 }
492
493 /**
494 * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
495 * <p>
496 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(StartTagType)} method,
497 * except that elements which are not entirely enclosed by this segment are excluded.
498 * <p>
499 * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
500 *
501 * @param startTagType the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
502 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
503 */
504 public List<Element> getAllElements(final StartTagType startTagType) {
505 if (startTagType==null) throw new IllegalArgumentException("startTagType argument must not be null");
506 return getAllElements(getAllStartTags(startTagType));
507 }
508
509 /**
510 * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
511 * <p>
512 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method,
513 * except that elements which are not entirely enclosed by this segment are excluded.
514 * <p>
515 * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
516 *
517 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
518 * @param value the value of the specified attribute to search for, must not be <code>null</code>.
519 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
520 * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
521 * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
522 */
523 public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) {
524 return getAllElements(getAllStartTags(attributeName,value,valueCaseSensitive));
525 }
526
527 /**
528 * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
529 * <p>
530 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method,
531 * except that elements which are not entirely enclosed by this segment are excluded.
532 * <p>
533 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
534 * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
535 * <p>
536 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
537 *
538 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
539 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
540 * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
541 * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
542 */
543 public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern) {
544 return getAllElements(getAllStartTags(attributeName,valueRegexPattern));
545 }
546
547 /**
548 * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
549 * <p>
550 * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
551 * class names separated by white space in the attribute value.
552 * <p>
553 * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method,
554 * except that elements which are not entirely enclosed by this segment are excluded.
555 * <p>
556 * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
557 *
558 * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
559 * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
560 */
561 public List<Element> getAllElementsByClass(final String className) {
562 return getAllElements(getAllStartTagsByClass(className));
563 }
564
565 /**
566 * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
567 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
568 */
569 public List<CharacterReference> getAllCharacterReferences() {
570 CharacterReference characterReference=getNextCharacterReference(begin);
571 if (characterReference==null) return Collections.emptyList();
572 final ArrayList<CharacterReference> list=new ArrayList<CharacterReference>();
573 do {
574 list.add(characterReference);
575 characterReference=getNextCharacterReference(characterReference.end);
576 } while (characterReference!=null);
577 return list;
578 }
579
580 /**
581 * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
582 * <p>
583 * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
584 * <table class="bordered" cellspacing="0">
585 * <tr><th>HTML element name<th>Attribute name
586 * <tr><td>{@link HTMLElementName#A A}<td>href
587 * <tr><td>{@link HTMLElementName#APPLET APPLET}<td>codebase
588 * <tr><td>{@link HTMLElementName#AREA AREA}<td>href
589 * <tr><td>{@link HTMLElementName#BASE BASE}<td>href
590 * <tr><td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}<td>cite
591 * <tr><td>{@link HTMLElementName#BODY BODY}<td>background
592 * <tr><td>{@link HTMLElementName#FORM FORM}<td>action
593 * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>longdesc
594 * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>src
595 * <tr><td>{@link HTMLElementName#DEL DEL}<td>cite
596 * <tr><td>{@link HTMLElementName#HEAD HEAD}<td>profile
597 * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>longdesc
598 * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>src
599 * <tr><td>{@link HTMLElementName#IMG IMG}<td>longdesc
600 * <tr><td>{@link HTMLElementName#IMG IMG}<td>src
601 * <tr><td>{@link HTMLElementName#IMG IMG}<td>usemap
602 * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>src
603 * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>usemap
604 * <tr><td>{@link HTMLElementName#INS INS}<td>cite
605 * <tr><td>{@link HTMLElementName#LINK LINK}<td>href
606 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>classid
607 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>codebase
608 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>data
609 * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>usemap
610 * <tr><td>{@link HTMLElementName#Q Q}<td>cite
611 * <tr><td>{@link HTMLElementName#SCRIPT SCRIPT}<td>src
612 * </table>
613 * <p>
614 * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
615 * <p>
616 * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
617 * <p>
618 * The attributes are returned in order of appearance.
619 *
620 * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
621 * @see #getStyleURISegments()
622 */
623 public List<Attribute> getURIAttributes() {
624 return URIAttributes.getList(this);
625 }
626
627 /**
628 * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments}
629 * inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
630 * <p>
631 * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
632 * <p>
633 * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in
634 * <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
635 * <p>
636 * The segments are returned in order of appearance.
637 *
638 * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
639 * @see #getURIAttributes()
640 */
641 public List<Segment> getStyleURISegments() {
642 return URIAttributes.getStyleURISegments(this);
643 }
644
645 /**
646 * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
647 * <p>
648 * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>,
649 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
650 *
651 * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
652 */
653 public final StartTag getFirstStartTag() {
654 return checkEnclosure(source.getNextStartTag(begin));
655 }
656
657 /**
658 * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
659 * <p>
660 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>,
661 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
662 *
663 * @param startTagType the <code>StartTagType</code> to search for.
664 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
665 */
666 public final StartTag getFirstStartTag(StartTagType startTagType) {
667 return checkEnclosure(source.getNextStartTag(begin,startTagType));
668 }
669
670 /**
671 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
672 * <p>
673 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>,
674 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
675 * <p>
676 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
677 *
678 * @param name the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
679 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
680 */
681 public final StartTag getFirstStartTag(String name) {
682 return checkEnclosure(source.getNextStartTag(begin,name));
683 }
684
685 /**
686 * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
687 * <p>
688 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
689 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
690 *
691 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
692 * @param value the value of the specified attribute to search for, must not be <code>null</code>.
693 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
694 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
695 * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
696 */
697 public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) {
698 return checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
699 }
700
701 /**
702 * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
703 * <p>
704 * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
705 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
706 *
707 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
708 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
709 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
710 * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
711 */
712 public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) {
713 return checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
714 }
715
716 /**
717 * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
718 * <p>
719 * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>,
720 * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
721 *
722 * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
723 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
724 */
725 public final StartTag getFirstStartTagByClass(final String className) {
726 return checkEnclosure(source.getNextStartTagByClass(begin,className));
727 }
728
729 /**
730 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
731 * <p>
732 * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>,
733 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
734 * <p>
735 * If this segment is itself an {@link Element}, this element is returned, not the first child element.
736 *
737 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
738 */
739 public final Element getFirstElement() {
740 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
741 while (startTag!=null) {
742 final Element element=startTag.getElement();
743 if (element.end<=end) return element;
744 startTag=checkEnclosure(startTag.getNextStartTag());
745 }
746 return null;
747 }
748
749 /**
750 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
751 * <p>
752 * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>,
753 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
754 * <p>
755 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
756 * <p>
757 * If this segment is itself an {@link Element} with the specified name, this element is returned.
758 *
759 * @param name the {@linkplain Element#getName() name} of the element to search for.
760 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
761 */
762 public final Element getFirstElement(String name) {
763 if (name==null) return getFirstElement();
764 final boolean isXMLTagName=Tag.isXMLName(name);
765 StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
766 while (startTag!=null) {
767 final Element element=startTag.getElement();
768 if (element.end<=end) return element;
769 startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
770 }
771 return null;
772 }
773
774 /**
775 * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
776 * <p>
777 * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
778 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
779 * <p>
780 * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
781 *
782 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
783 * @param value the value of the specified attribute to search for, must not be <code>null</code>.
784 * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
785 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
786 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
787 */
788 public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
789 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
790 while (startTag!=null) {
791 final Element element=startTag.getElement();
792 if (element.end<=end) return element;
793 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
794 }
795 return null;
796 }
797
798 /**
799 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
800 * <p>
801 * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
802 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
803 * <p>
804 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
805 *
806 * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
807 * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
808 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
809 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
810 */
811 public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
812 StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
813 while (startTag!=null) {
814 final Element element=startTag.getElement();
815 if (element.end<=end) return element;
816 startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
817 }
818 return null;
819 }
820
821 /**
822 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
823 * <p>
824 * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>,
825 * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
826 * <p>
827 * If this segment is itself an {@link Element} with the specified class, this element is returned.
828 *
829 * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
830 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
831 */
832 public final Element getFirstElementByClass(final String className) {
833 StartTag startTag=checkEnclosure(source.getNextStartTagByClass(begin,className));
834 while (startTag!=null) {
835 final Element element=startTag.getElement();
836 if (element.end<=end) return element;
837 startTag=checkEnclosure(source.getNextStartTagByClass(startTag.begin+1,className));
838 }
839 return null;
840 }
841
842 /**
843 * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
844 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
845 */
846 public List<FormControl> getFormControls() {
847 return FormControl.getAll(this);
848 }
849
850 /**
851 * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
852 * <p>
853 * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
854 *
855 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
856 * @see #getFormControls()
857 */
858 public FormFields getFormFields() {
859 return new FormFields(getFormControls());
860 }
861
862 /**
863 * Parses any {@link Attributes} within this segment.
864 * This method is only used in the unusual situation where attributes exist outside of a start tag.
865 * The {@link StartTag#getAttributes()} method should be used in normal situations.
866 * <p>
867 * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
868 *
869 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
870 */
871 public Attributes parseAttributes() {
872 return source.parseAttributes(begin,end);
873 }
874
875 /**
876 * Causes this segment to be ignored when parsing.
877 * <p>
878 * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
879 * <p>
880 * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
881 * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
882 * (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
883 * as well as preventing non-server tags from being recognised inside server tags.
884 * <p>
885 * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags,
886 * as the attributes parser automatically ignores any server tags.
887 * <p>
888 * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
889 * as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
890 * <p>
891 * This leaves only very few scenarios where calling this method still provides a significant benefit.
892 * <p>
893 * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
894 * Here is an example using an XML-style JSP tag:
895 * <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote>
896 * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
897 * as there is no way for the parser to recognise the <code>i18n:resource</code> element as a server tag.
898 * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
899 * but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
900 * find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
901 * <p>
902 * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
903 * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
904 * tools such as {@link TextExtractor} and {@link Renderer}.
905 * <p>
906 * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
907 * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
908 * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
909 * and perform the desired operations on this new source object.
910 * <p>
911 * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
912 * <p>
913 * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
914 * and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
915 * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
916 * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
917 * <p>
918 * For best performance, this method should be called on all segments that need to be ignored without calling
919 * any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
920 *
921 * @see Source#ignoreWhenParsing(Collection segments)
922 */
923 public void ignoreWhenParsing() {
924 source.ignoreWhenParsing(begin,end);
925 }
926
927 /**
928 * Compares this <code>Segment</code> object to another object.
929 * <p>
930 * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
931 * <p>
932 * A segment is considered to be before another segment if its begin position is earlier,
933 * or in the case that both segments begin at the same position, its end position is earlier.
934 * <p>
935 * Segments that begin and end at the same position are considered equal for
936 * the purposes of this comparison, even if they relate to different source documents.
937 * <p>
938 * Note: this class has a natural ordering that is inconsistent with equals.
939 * This means that this method may return zero in some cases where calling the
940 * {@link #equals(Object)} method with the same argument returns <code>false</code>.
941 *
942 * @param segment the segment to be compared
943 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
944 * @throws ClassCastException if the argument is not a <code>Segment</code>
945 */
946 public int compareTo(final Segment segment) {
947 if (this==segment) return 0;
948 if (begin<segment.begin) return -1;
949 if (begin>segment.begin) return 1;
950 if (end<segment.end) return -1;
951 if (end>segment.end) return 1;
952 return 0;
953 }
954
955 /**
956 * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
957 * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
958 */
959 public final boolean isWhiteSpace() {
960 for (int i=begin; i<end; i++)
961 if (!isWhiteSpace(source.charAt(i))) return false;
962 return true;
963 }
964
965 /**
966 * Returns an indication of the maximum depth of nested elements within this segment.
967 * <p>
968 * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code>
969 * if its content is parsed.
970 * <p>
971 * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught.
972 * The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling
973 * this method to check every segment or document will very often exceed any benefit.
974 * <p>
975 * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application
976 * and other factors.
977 * <p>
978 * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the
979 * {@link Element#getDepth()} method on the most nested element.
980 *
981 * @return an indication of the maximum depth of nested elements within this segment.
982 */
983 public int getMaxDepthIndicator() {
984 int maxDepth=0;
985 int depth=0;
986 for (Tag tag : getAllTags()) {
987 if (tag instanceof StartTag) {
988 StartTag startTag=(StartTag)tag;
989 if (startTag.getStartTagType().getCorrespondingEndTagType()==null) continue;
990 if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
991 if (startTag.isEmptyElementTag()) continue;
992 depth++;
993 if (depth>maxDepth) maxDepth++;
994 } else {
995 depth--;
996 }
997 }
998 return maxDepth;
999 }
1000
1001 /**
1002 * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
1003 * <p>
1004 * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
1005 * specifies the following white space characters:
1006 * <ul>
1007 * <li>space (U+0020)
1008 * <li>tab (U+0009)
1009 * <li>form feed (U+000C)
1010 * <li>line feed (U+000A)
1011 * <li>carriage return (U+000D)
1012 * <li>zero-width space (U+200B)
1013 * </ul>
1014 * <p>
1015 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
1016 * recognise them as white space and renders them as an unprintable character (empty square).
1017 * Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
1018 *
1019 * @param ch the character to test.
1020 * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
1021 */
1022 public static final boolean isWhiteSpace(final char ch) {
1023 for (char whiteSpaceChar : WHITESPACE) if (ch==whiteSpaceChar) return true;
1024 return false;
1025 }
1026
1027 /**
1028 * Returns a string representation of this object useful for debugging purposes.
1029 * @return a string representation of this object useful for debugging purposes.
1030 */
1031 public String getDebugInfo() {
1032 final StringBuilder sb=new StringBuilder(50);
1033 sb.append('(');
1034 source.getRowColumnVector(begin).appendTo(sb);
1035 sb.append('-');
1036 source.getRowColumnVector(end).appendTo(sb);
1037 sb.append(')');
1038 return sb.toString();
1039 }
1040
1041 /**
1042 * Returns the character at the specified index.
1043 * <p>
1044 * This is logically equivalent to <code>toString().charAt(index)</code>
1045 * for valid argument values <code>0 &lt;= index &lt; length()</code>.
1046 * <p>
1047 * However because this implementation works directly on the underlying document source string,
1048 * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
1049 * for an invalid argument value.
1050 *
1051 * @param index the index of the character.
1052 * @return the character at the specified index.
1053 */
1054 public char charAt(final int index) {
1055 return source.charAt(begin+index);
1056 }
1057
1058 /**
1059 * Returns a new character sequence that is a subsequence of this sequence.
1060 * <p>
1061 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
1062 * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
1063 * <p>
1064 * However because this implementation works directly on the underlying document source text,
1065 * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
1066 * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
1067 *
1068 * @param beginIndex the begin index, inclusive.
1069 * @param endIndex the end index, exclusive.
1070 * @return a new character sequence that is a subsequence of this sequence.
1071 */
1072 public CharSequence subSequence(final int beginIndex, final int endIndex) {
1073 return source.subSequence(begin+beginIndex,begin+endIndex);
1074 }
1075
1076 /**
1077 * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
1078 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
1079 */
1080 static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) {
1081 final int textLength=text.length();
1082 int i=0;
1083 boolean lastWasWhiteSpace=false;
1084 while (true) {
1085 if (i>=textLength) return sb;
1086 if (!isWhiteSpace(text.charAt(i))) break;
1087 i++;
1088 }
1089 do {
1090 final char ch=text.charAt(i++);
1091 if (isWhiteSpace(ch)) {
1092 lastWasWhiteSpace=true;
1093 } else {
1094 if (lastWasWhiteSpace) {
1095 sb.append(' ');
1096 lastWasWhiteSpace=false;
1097 }
1098 sb.append(ch);
1099 }
1100 } while (i<textLength);
1101 return sb;
1102 }
1103
1104 static final Pattern getClassPattern(final String className) {
1105 return Pattern.compile(".*(\\s|^)"+className+"(\\s|$).*",Pattern.DOTALL);
1106 }
1107
1108 private List<Element> getAllElements(final List<StartTag> startTags) {
1109 if (startTags.isEmpty()) return Collections.emptyList();
1110 final ArrayList<Element> elements=new ArrayList<Element>(startTags.size());
1111 for (StartTag startTag : startTags) {
1112 final Element element=startTag.getElement();
1113 if (element.end<=end) elements.add(element);
1114 }
1115 return elements;
1116 }
1117
1118 private StartTag checkEnclosure(final StartTag startTag) {
1119 if (startTag==null || startTag.end>end) return null;
1120 return startTag;
1121 }
1122
1123 private Tag checkTagEnclosure(final Tag tag) {
1124 if (tag==null || tag.end>end) return null;
1125 return tag;
1126 }
1127
1128 private CharacterReference getNextCharacterReference(final int pos) {
1129 final CharacterReference characterReference=source.getNextCharacterReference(pos);
1130 if (characterReference==null || characterReference.end>end) return null;
1131 return characterReference;
1132 }
1133 }
1134

   
Visit the aagtl Website