/[aagtl_public1]/src/net/htmlparser/jericho/Segment.java
aagtl

Contents of /src/net/htmlparser/jericho/Segment.java

Parent Directory | Revision Log


Revision 2 - (hide annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 59598 byte(s)
initial import of aagtl source code
1 zoffadmin 2 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2     // Version 3.2
3     // Copyright (C) 2004-2009 Martin Jericho
4     // http://jericho.htmlparser.net/
5     //
6     // This library is free software; you can redistribute it and/or
7     // modify it under the terms of either one of the following licences:
8     //
9     // 1. The Eclipse Public License (EPL) version 1.0,
10     // included in this distribution in the file licence-epl-1.0.html
11     // or available at http://www.eclipse.org/legal/epl-v10.html
12     //
13     // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14     // included in this distribution in the file licence-lgpl-2.1.txt
15     // or available at http://www.gnu.org/licenses/lgpl.txt
16     //
17     // This library is distributed on an "AS IS" basis,
18     // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19     // See the individual licence texts for more details.
20    
21     package net.htmlparser.jericho;
22    
23     import java.util.Iterator;
24     import java.util.List;
25     import java.util.Collections;
26     import java.util.ArrayList;
27     import java.util.regex.Pattern;
28    
29     /**
30     * Represents a segment of a {@link Source} document.
31     * <p>
32     * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
33     * <p>
34     * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
35     */
36     public class Segment implements Comparable<Segment>, CharSequence {
37     final int begin;
38     final int end;
39     final Source source;
40    
41     private static final char[] WHITESPACE={' ','\n','\r','\t','\f','\u200B'}; // see comments in isWhiteSpace(char) method
42    
43     /**
44     * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
45     * @param source the {@link Source} document, must not be <code>null</code>.
46     * @param begin the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
47     * @param end the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
48     */
public Segment(final Source source, final int begin, final int end) {
    // Validate everything up front so no field is assigned before a failure is reported.
    if (source==null) throw new IllegalArgumentException("source argument must not be null");
    // -1 is rejected explicitly for either bound, as is an inverted span.
    if (begin==-1 || end==-1 || begin>end) {
        throw new IllegalArgumentException("invalid segment span: begin="+begin+", end="+end);
    }
    this.source=source;
    this.begin=begin;
    this.end=end;
}
56    
57     // Only called from Source constructor
Segment(final int length) {
    // The segment spans positions 0..length and acts as its own source document.
    this.begin=0;
    this.end=length;
    this.source=(Source)this;
}
63    
64     // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
Segment() {
    // Delegate to the dummy (begin,end) constructor with an empty span at position 0.
    this(0, 0);
}
68    
69     // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
Segment(final int begin, final int end) {
    // Dummy instances carry no source document and perform no validation of the span.
    this.source=null;
    this.begin=begin;
    this.end=end;
}
75    
76     /**
77     * Returns the {@link Source} document containing this segment.
78     * <p>
79     * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
80     *
81     * @return the {@link Source} document containing this segment.
82     */
83     public final Source getSource() {
84     if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
85     return source;
86     }
87    
88     /**
89     * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
90     * <p>
91     * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
92     *
93     * @return the character position in the {@link Source} document at which this segment begins, inclusive.
94     */
95     public final int getBegin() {
96     return begin;
97     }
98    
99     /**
100     * Returns the character position in the {@link Source} document immediately after the end of this segment.
101     * <p>
102     * The character at the position specified by this property is <b>not</b> included in the segment.
103     *
104     * @return the character position in the {@link Source} document immediately after the end of this segment.
105     * @see #getBegin()
106     */
107     public final int getEnd() {
108     return end;
109     }
110    
111     /**
112     * Compares the specified object with this <code>Segment</code> for equality.
113     * <p>
114     * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>,
115     * and both segments have the same {@link Source}, and the same begin and end positions.
116     * @param object the object to be compared for equality with this <code>Segment</code>.
117     * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
118     */
119     public final boolean equals(final Object object) {
120     if (this==object) return true;
121     if (object==null || !(object instanceof Segment)) return false;
122     final Segment segment=(Segment)object;
123     return segment.begin==begin && segment.end==end && segment.source==source;
124     }
125    
126     /**
127     * Returns a hash code value for the segment.
128     * <p>
129     * The current implementation returns the sum of the begin and end positions, although this is not
130     * guaranteed in future versions.
131     *
132     * @return a hash code value for the segment.
133     */
134     public int hashCode() {
135     return begin+end;
136     }
137    
138     /**
139     * Returns the length of the segment.
140     * This is defined as the number of characters between the begin and end positions.
141     * @return the length of the segment.
142     */
143     public int length() {
144     return end-begin;
145     }
146    
147     /**
148     * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
149     * <p>
150     * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
151     * <p>
152     * Note that a segment encloses itself.
153     *
154     * @param segment the segment to be tested for being enclosed by this segment.
155     * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
156     */
157     public final boolean encloses(final Segment segment) {
158     return begin<=segment.begin && end>=segment.end;
159     }
160    
161     /**
162     * Indicates whether this segment encloses the specified character position in the source document.
163     * <p>
164     * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
165     *
166     * @param pos the position in the {@link Source} document.
167     * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
168     */
169     public final boolean encloses(final int pos) {
170     return begin<=pos && pos<end;
171     }
172    
173     /**
174     * Returns the source text of this segment as a <code>String</code>.
175     * <p>
176     * The returned <code>String</code> is newly created with every call to this method, unless this
177     * segment is itself an instance of {@link Source}.
178     *
179     * @return the source text of this segment as a <code>String</code>.
180     */
181     public String toString() {
182     return source.subSequence(begin,end).toString();
183     }
184    
185     /**
186     * Performs a simple rendering of the HTML markup in this segment into text.
187     * <p>
188     * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before
189     * {@linkplain Renderer#writeTo(Writer) obtaining its output}.
190     *
191     * @return an instance of {@link Renderer} based on this segment.
192     * @see #getTextExtractor()
193     */
194     public Renderer getRenderer() {
195     return new Renderer(this);
196     }
197    
198     /**
199     * Extracts the textual content from the HTML markup of this segment.
200     * <p>
201     * The output can be configured by setting properties on the returned {@link TextExtractor} instance before
202     * {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
203     * <p>
204     * @return an instance of {@link TextExtractor} based on this segment.
205     * @see #getRenderer()
206     */
207     public TextExtractor getTextExtractor() {
208     return new TextExtractor(this);
209     }
210    
211     /**
212     * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
213     * <p>
214     * See the {@link Source#iterator()} method for a detailed description.
215     * <p>
216     * <dl>
217     * <dt>Example:</dt>
218     * <dd>
219     * <p>
220     * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
221     * </p>
222     * <pre>
223     * for (Iterator&lt;Segment&gt; nodeIterator=segment.getNodeIterator(); nodeIterator.hasNext();) {
224     * Segment nodeSegment=nodeIterator.next();
225     * if (nodeSegment instanceof Tag) {
226     * Tag tag=(Tag)nodeSegment;
227     * // HANDLE TAG
228     * // Uncomment the following line to ensure each tag is valid XML:
229     * // writer.write(tag.tidy()); continue;
230     * } else if (nodeSegment instanceof CharacterReference) {
231     * CharacterReference characterReference=(CharacterReference)nodeSegment;
232     * // HANDLE CHARACTER REFERENCE
233     * // Uncomment the following line to decode all character references instead of copying them verbatim:
234     * // characterReference.appendCharTo(writer); continue;
235     * } else {
236     * // HANDLE PLAIN TEXT
237     * }
238     * // unless specific handling has prevented getting to here, simply output the segment as is:
239     * writer.write(nodeSegment.toString());
240     * }</pre>
241     * </dd>
242     * </dl>
243     * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
244     */
245     public Iterator<Segment> getNodeIterator() {
246     return new NodeIterator(this);
247     }
248    
249     /**
250     * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
251     * <p>
252     * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
253     * if this method is to be used on a large proportion of the source.
254     * It is called automatically if this method is called on the {@link Source} object itself.
255     * <p>
256     * See the {@link Tag} class documentation for more details about the behaviour of this method.
257     *
258     * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
259     */
260     public List<Tag> getAllTags() {
261     return getAllTags(null);
262     }
263    
264     /**
265     * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
266     * <p>
267     * See the {@link Tag} class documentation for more details about the behaviour of this method.
268     * <p>
269     * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
270     *
271     * @param tagType the {@linkplain TagType type} of tags to get.
272     * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
273     * @see #getAllStartTags(StartTagType)
274     */
275     public List<Tag> getAllTags(final TagType tagType) {
276     Tag tag=checkTagEnclosure(Tag.getNextTag(source,begin,tagType));
277     if (tag==null) return Collections.emptyList();
278     final ArrayList<Tag> list=new ArrayList<Tag>();
279     do {
280     list.add(tag);
281     tag=checkTagEnclosure(tag.getNextTag(tagType));
282     } while (tag!=null);
283     return list;
284     }
285    
286     /**
287     * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
288     * <p>
289     * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
290     * if this method is to be used on a large proportion of the source.
291     * It is called automatically if this method is called on the {@link Source} object itself.
292     * <p>
293     * See the {@link Tag} class documentation for more details about the behaviour of this method.
294     *
295     * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
296     */
297     public List<StartTag> getAllStartTags() {
298     StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
299     if (startTag==null) return Collections.emptyList();
300     final ArrayList<StartTag> list=new ArrayList<StartTag>();
301     do {
302     list.add(startTag);
303     startTag=checkEnclosure(startTag.getNextStartTag());
304     } while (startTag!=null);
305     return list;
306     }
307    
308     /**
309     * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
310     * <p>
311     * See the {@link Tag} class documentation for more details about the behaviour of this method.
312     * <p>
313     * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
314     *
315     * @param startTagType the {@linkplain StartTagType type} of tags to get.
316     * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
317     */
318     public List<StartTag> getAllStartTags(final StartTagType startTagType) {
319     if (startTagType==null) return getAllStartTags();
320     StartTag startTag=(StartTag)checkTagEnclosure(Tag.getNextTag(source,begin,startTagType));
321     if (startTag==null) return Collections.emptyList();
322     final ArrayList<StartTag> list=new ArrayList<StartTag>();
323     do {
324     list.add(startTag);
325     startTag=(StartTag)checkTagEnclosure(startTag.getNextTag(startTagType));
326     } while (startTag!=null);
327     return list;
328     }
329    
330     /**
331     * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
332     * <p>
333     * See the {@link Tag} class documentation for more details about the behaviour of this method.
334     * <p>
335     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
336     * <p>
337     * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
338     *
339     * @param name the {@linkplain StartTag#getName() name} of the start tags to get.
340     * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
341     */
342     public List<StartTag> getAllStartTags(String name) {
343     if (name==null) return getAllStartTags();
344     final boolean isXMLTagName=Tag.isXMLName(name);
345     StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
346     if (startTag==null) return Collections.emptyList();
347     final ArrayList<StartTag> list=new ArrayList<StartTag>();
348     do {
349     list.add(startTag);
350     startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
351     } while (startTag!=null);
352     return list;
353     }
354    
355     /**
356     * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
357     * <p>
358     * See the {@link Tag} class documentation for more details about the behaviour of this method.
359     *
360     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
361     * @param value the value of the specified attribute to search for, must not be <code>null</code>.
362     * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
363     * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
364     * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
365     */
366     public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive) {
367     StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
368     if (startTag==null) return Collections.emptyList();
369     final ArrayList<StartTag> list=new ArrayList<StartTag>();
370     do {
371     list.add(startTag);
372     startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
373     } while (startTag!=null);
374     return list;
375     }
376    
377     /**
378     * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
379     * <p>
380     * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
381     * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
382     * <p>
383     * See the {@link Tag} class documentation for more details about the behaviour of this method.
384     *
385     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
386     * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
387     * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
388     * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
389     */
390     public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern) {
391     StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
392     if (startTag==null) return Collections.emptyList();
393     final ArrayList<StartTag> list=new ArrayList<StartTag>();
394     do {
395     list.add(startTag);
396     startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
397     } while (startTag!=null);
398     return list;
399     }
400    
401     /**
402     * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
403     * <p>
404     * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
405     * class names separated by white space in the attribute value.
406     * <p>
407     * See the {@link Tag} class documentation for more details about the behaviour of this method.
408     *
409     * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
410     * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
411     */
412     public List<StartTag> getAllStartTagsByClass(final String className) {
413     return getAllStartTags("class",getClassPattern(className));
414     }
415    
416     /**
417     * Returns a list of the immediate children of this segment in the document element hierarchy.
418     * <p>
419     * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
420     * <p>
421     * An element found at the start of this segment is included in the list.
422     * Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead,
423     * which only returns the children of the element.
424     * <p>
425     * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
426     * <p>
427     * The objects in the list are all of type {@link Element}.
428     * <p>
429     * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
430     * if this method is to be used on a large proportion of the source.
431     * It is called automatically if this method is called on the {@link Source} object itself.
432     * <p>
433     * See the {@link Source#getChildElements()} method for more details.
434     *
435     * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
436     * @see Element#getParentElement()
437     */
438     public List<Element> getChildElements() {
439     if (length()==0) return Collections.emptyList();
440     List<Element> childElements=new ArrayList<Element>();
441     int pos=begin;
442     while (true) {
443     final StartTag childStartTag=source.getNextStartTag(pos);
444     if (childStartTag==null || childStartTag.begin>=end) break;
445     if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag()) {
446     pos=childStartTag.end;
447     continue;
448     }
449     final Element childElement=childStartTag.getElement();
450     childElements.add(childElement);
451     childElement.getChildElements();
452     pos=childElement.end;
453     }
454     return childElements;
455     }
456    
457     /**
458     * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
459     * <p>
460     * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object
461     * if this method is to be used on a large proportion of the source.
462     * It is called automatically if this method is called on the {@link Source} object itself.
463     * <p>
464     * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
465     * <p>
466     * If this segment is itself an {@link Element}, the result includes this element in the list.
467     *
468     * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
469     */
470     public List<Element> getAllElements() {
471     return getAllElements(getAllStartTags());
472     }
473    
474     /**
475     * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
476     * <p>
477     * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method,
478     * except that elements which are not entirely enclosed by this segment are excluded.
479     * <p>
480     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
481     * <p>
482     * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
483     * <p>
484     * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
485     *
486     * @param name the {@linkplain Element#getName() name} of the elements to get.
487     * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
488     */
489     public List<Element> getAllElements(String name) {
490     return getAllElements(getAllStartTags(name));
491     }
492    
493     /**
494     * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
495     * <p>
496     * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(StartTagType)} method,
497     * except that elements which are not entirely enclosed by this segment are excluded.
498     * <p>
499     * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
500     *
501     * @param startTagType the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
502     * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
503     */
504     public List<Element> getAllElements(final StartTagType startTagType) {
505     if (startTagType==null) throw new IllegalArgumentException("startTagType argument must not be null");
506     return getAllElements(getAllStartTags(startTagType));
507     }
508    
509     /**
510     * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
511     * <p>
512     * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method,
513     * except that elements which are not entirely enclosed by this segment are excluded.
514     * <p>
515     * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
516     *
517     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
518     * @param value the value of the specified attribute to search for, must not be <code>null</code>.
519     * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
520     * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
521     * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
522     */
523     public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive) {
524     return getAllElements(getAllStartTags(attributeName,value,valueCaseSensitive));
525     }
526    
527     /**
528     * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
529     * <p>
530     * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method,
531     * except that elements which are not entirely enclosed by this segment are excluded.
532     * <p>
533     * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only,
534     * without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
535     * <p>
536     * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
537     *
538     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
539     * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
540     * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
541     * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
542     */
543     public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern) {
544     return getAllElements(getAllStartTags(attributeName,valueRegexPattern));
545     }
546    
547     /**
548     * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
549     * <p>
550     * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple
551     * class names separated by white space in the attribute value.
552     * <p>
553     * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method,
554     * except that elements which are not entirely enclosed by this segment are excluded.
555     * <p>
556     * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
557     *
558     * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
559     * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
560     */
561     public List<Element> getAllElementsByClass(final String className) {
562     return getAllElements(getAllStartTagsByClass(className));
563     }
564    
565     /**
566     * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
567     * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
568     */
569     public List<CharacterReference> getAllCharacterReferences() {
570     CharacterReference characterReference=getNextCharacterReference(begin);
571     if (characterReference==null) return Collections.emptyList();
572     final ArrayList<CharacterReference> list=new ArrayList<CharacterReference>();
573     do {
574     list.add(characterReference);
575     characterReference=getNextCharacterReference(characterReference.end);
576     } while (characterReference!=null);
577     return list;
578     }
579    
580     /**
581     * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
582     * <p>
583     * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
584     * <table class="bordered" cellspacing="0">
585     * <tr><th>HTML element name<th>Attribute name
586     * <tr><td>{@link HTMLElementName#A A}<td>href
587     * <tr><td>{@link HTMLElementName#APPLET APPLET}<td>codebase
588     * <tr><td>{@link HTMLElementName#AREA AREA}<td>href
589     * <tr><td>{@link HTMLElementName#BASE BASE}<td>href
590     * <tr><td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}<td>cite
591     * <tr><td>{@link HTMLElementName#BODY BODY}<td>background
592     * <tr><td>{@link HTMLElementName#FORM FORM}<td>action
593     * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>longdesc
594     * <tr><td>{@link HTMLElementName#FRAME FRAME}<td>src
595     * <tr><td>{@link HTMLElementName#DEL DEL}<td>cite
596     * <tr><td>{@link HTMLElementName#HEAD HEAD}<td>profile
597     * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>longdesc
598     * <tr><td>{@link HTMLElementName#IFRAME IFRAME}<td>src
599     * <tr><td>{@link HTMLElementName#IMG IMG}<td>longdesc
600     * <tr><td>{@link HTMLElementName#IMG IMG}<td>src
601     * <tr><td>{@link HTMLElementName#IMG IMG}<td>usemap
602     * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>src
603     * <tr><td>{@link HTMLElementName#INPUT INPUT}<td>usemap
604     * <tr><td>{@link HTMLElementName#INS INS}<td>cite
605     * <tr><td>{@link HTMLElementName#LINK LINK}<td>href
606     * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>classid
607     * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>codebase
608     * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>data
609     * <tr><td>{@link HTMLElementName#OBJECT OBJECT}<td>usemap
610     * <tr><td>{@link HTMLElementName#Q Q}<td>cite
611     * <tr><td>{@link HTMLElementName#SCRIPT SCRIPT}<td>src
612     * </table>
613     * <p>
614     * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
615     * <p>
616     * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
617     * <p>
618     * The attributes are returned in order of appearance.
619     *
620     * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
621     * @see #getStyleURISegments()
622     */
623     public List<Attribute> getURIAttributes() {
624     return URIAttributes.getList(this);
625     }
626    
627     /**
628     * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments}
629     * inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
630     * <p>
631     * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
632     * <p>
633     * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in
634     * <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
635     * <p>
636     * The segments are returned in order of appearance.
637     *
638     * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
639     * @see #getURIAttributes()
640     */
641     public List<Segment> getStyleURISegments() {
642     return URIAttributes.getStyleURISegments(this);
643     }
644    
645     /**
646     * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
647     * <p>
648     * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>,
649     * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
650     *
651     * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
652     */
653     public final StartTag getFirstStartTag() {
654     return checkEnclosure(source.getNextStartTag(begin));
655     }
656    
657     /**
658     * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
659     * <p>
660     * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>,
661     * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
662     *
663     * @param startTagType the <code>StartTagType</code> to search for.
664     * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
665     */
666     public final StartTag getFirstStartTag(StartTagType startTagType) {
667     return checkEnclosure(source.getNextStartTag(begin,startTagType));
668     }
669    
670     /**
671     * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
672     * <p>
673     * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>,
674     * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
675     * <p>
676     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
677     *
678     * @param name the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
679     * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
680     */
681     public final StartTag getFirstStartTag(String name) {
682     return checkEnclosure(source.getNextStartTag(begin,name));
683     }
684    
685     /**
686     * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
687     * <p>
688     * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
689     * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
690     *
691     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
692     * @param value the value of the specified attribute to search for, must not be <code>null</code>.
693     * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
694     * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
695     * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
696     */
697     public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive) {
698     return checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
699     }
700    
701     /**
702     * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
703     * <p>
704     * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
705     * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
706     *
707     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
708     * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
709     * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
710     * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
711     */
712     public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern) {
713     return checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
714     }
715    
716     /**
717     * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
718     * <p>
719     * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>,
720     * but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
721     *
722     * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
723     * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
724     */
725     public final StartTag getFirstStartTagByClass(final String className) {
726     return checkEnclosure(source.getNextStartTagByClass(begin,className));
727     }
728    
729     /**
730     * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
731     * <p>
732     * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>,
733     * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
734     * <p>
735     * If this segment is itself an {@link Element}, this element is returned, not the first child element.
736     *
737     * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
738     */
739     public final Element getFirstElement() {
740     StartTag startTag=checkEnclosure(StartTag.getNext(source,begin));
741     while (startTag!=null) {
742     final Element element=startTag.getElement();
743     if (element.end<=end) return element;
744     startTag=checkEnclosure(startTag.getNextStartTag());
745     }
746     return null;
747     }
748    
749     /**
750     * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
751     * <p>
752     * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>,
753     * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
754     * <p>
755     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
756     * <p>
757     * If this segment is itself an {@link Element} with the specified name, this element is returned.
758     *
759     * @param name the {@linkplain Element#getName() name} of the element to search for.
760     * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
761     */
762     public final Element getFirstElement(String name) {
763     if (name==null) return getFirstElement();
764     final boolean isXMLTagName=Tag.isXMLName(name);
765     StartTag startTag=checkEnclosure(StartTag.getNext(source,begin,name,StartTagType.NORMAL,isXMLTagName));
766     while (startTag!=null) {
767     final Element element=startTag.getElement();
768     if (element.end<=end) return element;
769     startTag=checkEnclosure(StartTag.getNext(source,startTag.begin+1,name,StartTagType.NORMAL,isXMLTagName));
770     }
771     return null;
772     }
773    
774     /**
775     * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
776     * <p>
777     * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>,
778     * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
779     * <p>
780     * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
781     *
782     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
783     * @param value the value of the specified attribute to search for, must not be <code>null</code>.
784     * @param valueCaseSensitive specifies whether the attribute value matching is case sensitive.
785     * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
786     * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
787     */
788     public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive) {
789     StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,value,valueCaseSensitive));
790     while (startTag!=null) {
791     final Element element=startTag.getElement();
792     if (element.end<=end) return element;
793     startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,value,valueCaseSensitive));
794     }
795     return null;
796     }
797    
798     /**
799     * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
800     * <p>
801     * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>,
802     * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
803     * <p>
804     * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
805     *
806     * @param attributeName the attribute name (case insensitive) to search for, must not be <code>null</code>.
807     * @param valueRegexPattern the regular expression pattern that must match the attribute value, may be <code>null</code>.
808     * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
809     * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
810     */
811     public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern) {
812     StartTag startTag=checkEnclosure(source.getNextStartTag(begin,attributeName,valueRegexPattern));
813     while (startTag!=null) {
814     final Element element=startTag.getElement();
815     if (element.end<=end) return element;
816     startTag=checkEnclosure(source.getNextStartTag(startTag.begin+1,attributeName,valueRegexPattern));
817     }
818     return null;
819     }
820    
821     /**
822     * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
823     * <p>
824     * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>,
825     * but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
826     * <p>
827     * If this segment is itself an {@link Element} with the specified class, this element is returned.
828     *
829     * @param className the class name (case sensitive) to search for, must not be <code>null</code>.
830     * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
831     */
832     public final Element getFirstElementByClass(final String className) {
833     StartTag startTag=checkEnclosure(source.getNextStartTagByClass(begin,className));
834     while (startTag!=null) {
835     final Element element=startTag.getElement();
836     if (element.end<=end) return element;
837     startTag=checkEnclosure(source.getNextStartTagByClass(startTag.begin+1,className));
838     }
839     return null;
840     }
841    
842     /**
843     * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
844     * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
845     */
846     public List<FormControl> getFormControls() {
847     return FormControl.getAll(this);
848     }
849    
850     /**
851     * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
852     * <p>
853     * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
854     *
855     * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
856     * @see #getFormControls()
857     */
858     public FormFields getFormFields() {
859     return new FormFields(getFormControls());
860     }
861    
862     /**
863     * Parses any {@link Attributes} within this segment.
864     * This method is only used in the unusual situation where attributes exist outside of a start tag.
865     * The {@link StartTag#getAttributes()} method should be used in normal situations.
866     * <p>
867     * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
868     *
869     * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
870     */
871     public Attributes parseAttributes() {
872     return source.parseAttributes(begin,end);
873     }
874    
875     /**
876     * Causes this segment to be ignored when parsing.
877     * <p>
878     * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
879     * <p>
880     * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside
881     * {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags
882     * (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value),
883     * as well as preventing non-server tags from being recognised inside server tags.
884     * <p>
885     * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags,
886     * as the attributes parser automatically ignores any server tags.
887     * <p>
888     * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements,
889     * as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
890     * <p>
891     * This leaves only very few scenarios where calling this method still provides a significant benefit.
892     * <p>
893     * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags.
894     * Here is an example using an XML-style JSP tag:
895     * <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote>
896     * The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute,
897     * as there is no way for the parser to recognise the <code>i18n:resource</code> element as a server tag.
898     * Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice,
899     * but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to
900     * find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
901     * <p>
902     * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely.
903     * Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of
904     * tools such as {@link TextExtractor} and {@link Renderer}.
905     * <p>
906     * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or
907     * {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment.
908     * Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())}
909     * and perform the desired operations on this new source object.
910     * <p>
911     * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
912     * <p>
913     * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache},
914     * and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>.
915     * If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache.
916     * Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
917     * <p>
918     * For best performance, this method should be called on all segments that need to be ignored without calling
919     * any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
920     *
921     * @see Source#ignoreWhenParsing(Collection segments)
922     */
923     public void ignoreWhenParsing() {
924     source.ignoreWhenParsing(begin,end);
925     }
926    
927     /**
928     * Compares this <code>Segment</code> object to another object.
929     * <p>
930     * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
931     * <p>
932     * A segment is considered to be before another segment if its begin position is earlier,
933     * or in the case that both segments begin at the same position, its end position is earlier.
934     * <p>
935     * Segments that begin and end at the same position are considered equal for
936     * the purposes of this comparison, even if they relate to different source documents.
937     * <p>
938     * Note: this class has a natural ordering that is inconsistent with equals.
939     * This means that this method may return zero in some cases where calling the
940     * {@link #equals(Object)} method with the same argument returns <code>false</code>.
941     *
942     * @param segment the segment to be compared
943     * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
944     * @throws ClassCastException if the argument is not a <code>Segment</code>
945     */
946     public int compareTo(final Segment segment) {
947     if (this==segment) return 0;
948     if (begin<segment.begin) return -1;
949     if (begin>segment.begin) return 1;
950     if (end<segment.end) return -1;
951     if (end>segment.end) return 1;
952     return 0;
953     }
954    
955     /**
956     * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
957     * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
958     */
959     public final boolean isWhiteSpace() {
960     for (int i=begin; i<end; i++)
961     if (!isWhiteSpace(source.charAt(i))) return false;
962     return true;
963     }
964    
965     /**
966     * Returns an indication of the maximum depth of nested elements within this segment.
967     * <p>
968     * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code>
969     * if its content is parsed.
970     * <p>
971     * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught.
972     * The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling
973     * this method to check every segment or document will very often exceed any benefit.
974     * <p>
975     * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application
976     * and other factors.
977     * <p>
978     * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the
979     * {@link Element#getDepth()} method on the most nested element.
980     *
981     * @return an indication of the maximum depth of nested elements within this segment.
982     */
983     public int getMaxDepthIndicator() {
984     int maxDepth=0;
985     int depth=0;
986     for (Tag tag : getAllTags()) {
987     if (tag instanceof StartTag) {
988     StartTag startTag=(StartTag)tag;
989     if (startTag.getStartTagType().getCorrespondingEndTagType()==null) continue;
990     if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
991     if (startTag.isEmptyElementTag()) continue;
992     depth++;
993     if (depth>maxDepth) maxDepth++;
994     } else {
995     depth--;
996     }
997     }
998     return maxDepth;
999     }
1000    
1001     /**
1002     * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
1003     * <p>
1004     * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a>
1005     * specifies the following white space characters:
1006     * <ul>
1007     * <li>space (U+0020)
1008     * <li>tab (U+0009)
1009     * <li>form feed (U+000C)
1010     * <li>line feed (U+000A)
1011     * <li>carriage return (U+000D)
1012     * <li>zero-width space (U+200B)
1013     * </ul>
1014     * <p>
1015     * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not
1016     * recognise them as white space and renders them as an unprintable character (empty square).
1017     * Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
1018     *
1019     * @param ch the character to test.
1020     * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
1021     */
1022     public static final boolean isWhiteSpace(final char ch) {
1023     for (char whiteSpaceChar : WHITESPACE) if (ch==whiteSpaceChar) return true;
1024     return false;
1025     }
1026    
1027     /**
1028     * Returns a string representation of this object useful for debugging purposes.
1029     * @return a string representation of this object useful for debugging purposes.
1030     */
1031     public String getDebugInfo() {
1032     final StringBuilder sb=new StringBuilder(50);
1033     sb.append('(');
1034     source.getRowColumnVector(begin).appendTo(sb);
1035     sb.append('-');
1036     source.getRowColumnVector(end).appendTo(sb);
1037     sb.append(')');
1038     return sb.toString();
1039     }
1040    
1041     /**
1042     * Returns the character at the specified index.
1043     * <p>
1044     * This is logically equivalent to <code>toString().charAt(index)</code>
1045     * for valid argument values <code>0 &lt;= index &lt; length()</code>.
1046     * <p>
1047     * However because this implementation works directly on the underlying document source string,
1048     * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
1049     * for an invalid argument value.
1050     *
1051     * @param index the index of the character.
1052     * @return the character at the specified index.
1053     */
1054     public char charAt(final int index) {
1055     return source.charAt(begin+index);
1056     }
1057    
1058     /**
1059     * Returns a new character sequence that is a subsequence of this sequence.
1060     * <p>
1061     * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code>
1062     * for valid values of <code>beginIndex</code> and <code>endIndex</code>.
1063     * <p>
1064     * However because this implementation works directly on the underlying document source text,
1065     * it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown
1066     * for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
1067     *
1068     * @param beginIndex the begin index, inclusive.
1069     * @param endIndex the end index, exclusive.
1070     * @return a new character sequence that is a subsequence of this sequence.
1071     */
1072     public CharSequence subSequence(final int beginIndex, final int endIndex) {
1073     return source.subSequence(begin+beginIndex,begin+endIndex);
1074     }
1075    
1076     /**
1077     * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
1078     * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
1079     */
1080     static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text) {
1081     final int textLength=text.length();
1082     int i=0;
1083     boolean lastWasWhiteSpace=false;
1084     while (true) {
1085     if (i>=textLength) return sb;
1086     if (!isWhiteSpace(text.charAt(i))) break;
1087     i++;
1088     }
1089     do {
1090     final char ch=text.charAt(i++);
1091     if (isWhiteSpace(ch)) {
1092     lastWasWhiteSpace=true;
1093     } else {
1094     if (lastWasWhiteSpace) {
1095     sb.append(' ');
1096     lastWasWhiteSpace=false;
1097     }
1098     sb.append(ch);
1099     }
1100     } while (i<textLength);
1101     return sb;
1102     }
1103    
1104     static final Pattern getClassPattern(final String className) {
1105     return Pattern.compile(".*(\\s|^)"+className+"(\\s|$).*",Pattern.DOTALL);
1106     }
1107    
1108     private List<Element> getAllElements(final List<StartTag> startTags) {
1109     if (startTags.isEmpty()) return Collections.emptyList();
1110     final ArrayList<Element> elements=new ArrayList<Element>(startTags.size());
1111     for (StartTag startTag : startTags) {
1112     final Element element=startTag.getElement();
1113     if (element.end<=end) elements.add(element);
1114     }
1115     return elements;
1116     }
1117    
1118     private StartTag checkEnclosure(final StartTag startTag) {
1119     if (startTag==null || startTag.end>end) return null;
1120     return startTag;
1121     }
1122    
1123     private Tag checkTagEnclosure(final Tag tag) {
1124     if (tag==null || tag.end>end) return null;
1125     return tag;
1126     }
1127    
1128     private CharacterReference getNextCharacterReference(final int pos) {
1129     final CharacterReference characterReference=source.getNextCharacterReference(pos);
1130     if (characterReference==null || characterReference.end>end) return null;
1131     return characterReference;
1132     }
1133     }
1134    

   
Visit the aagtl Website