/[aagtl_public1]/src/net/htmlparser/jericho/Segment.java
aagtl

Contents of /src/net/htmlparser/jericho/Segment.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4 - (hide annotations) (download)
Sat Aug 1 08:47:10 2015 UTC (8 years, 7 months ago) by zoffadmin
File size: 60940 byte(s)
1.0.35
1 zoffadmin 2 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2     // Version 3.2
3     // Copyright (C) 2004-2009 Martin Jericho
4     // http://jericho.htmlparser.net/
5     //
6     // This library is free software; you can redistribute it and/or
7     // modify it under the terms of either one of the following licences:
8     //
9     // 1. The Eclipse Public License (EPL) version 1.0,
10     // included in this distribution in the file licence-epl-1.0.html
11     // or available at http://www.eclipse.org/legal/epl-v10.html
12     //
13     // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14     // included in this distribution in the file licence-lgpl-2.1.txt
15     // or available at http://www.gnu.org/licenses/lgpl.txt
16     //
17     // This library is distributed on an "AS IS" basis,
18     // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19     // See the individual licence texts for more details.
20    
21     package net.htmlparser.jericho;
22    
23 zoffadmin 4 import java.io.Writer;
24     import java.util.ArrayList;
25     import java.util.Collection;
26     import java.util.Collections;
27 zoffadmin 2 import java.util.Iterator;
28     import java.util.List;
29     import java.util.regex.Pattern;
30    
31     /**
32     * Represents a segment of a {@link Source} document.
33     * <p>
34     * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
35     * <p>
36     * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
37     */
38 zoffadmin 4 public class Segment implements Comparable<Segment>, CharSequence
39     {
// Character position in the source at which this segment begins (inclusive).
40 zoffadmin 2 final int begin;
// Character position in the source at which this segment ends (exclusive).
41     final int end;
// The Source document this segment belongs to; the package-private (int, int) constructor leaves it null.
42     final Source source;
43    
// NOTE(review): presumably the characters accepted by isWhiteSpace(char); that method is not visible here - confirm.
44 zoffadmin 4 private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t', '\f', '\u200B' }; // see comments in isWhiteSpace(char) method
45    
46 zoffadmin 2 /**
47     * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
48 zoffadmin 4 *
49     * @param source
50     * the {@link Source} document, must not be <code>null</code>.
51     * @param begin
52     * the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
53     * @param end
54     * the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
55 zoffadmin 2 */
public Segment(final Source source, final int begin, final int end)
{
    // Validate everything before touching any field.
    // Only the exact value -1 is rejected for a position (not every negative number),
    // together with a span whose begin lies after its end.
    if (begin == -1 || end == -1 || begin > end)
    {
        throw new IllegalArgumentException("invalid segment span: begin=" + begin + ", end=" + end);
    }
    // The span check deliberately comes first so that the exception raised for a
    // call with both a bad span and a null source is the same as before.
    if (source == null)
    {
        throw new IllegalArgumentException("source argument must not be null");
    }
    this.source = source;
    this.begin = begin;
    this.end = end;
}
64    
65     // Only called from Source constructor
Segment(final int length)
{
    // The span runs from position 0 (inclusive) up to length (exclusive).
    this.begin = 0;
    this.end = length;
    // The instance acts as its own source; the cast fails unless this object really is a Source.
    this.source = (Source) this;
}
72    
73     // Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED)
Segment()
{
    // Delegates to the (begin, end) constructor with an empty span at position 0.
    this(0, 0);
}
78    
79     // Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT)
Segment(final int begin, final int end)
{
    // No validation is performed here and no source document is attached.
    this.source = null;
    this.begin = begin;
    this.end = end;
}
86    
87     /**
88     * Returns the {@link Source} document containing this segment.
89     * <p>
90 zoffadmin 4 * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
91     *
92 zoffadmin 2 * @return the {@link Source} document containing this segment.
93     */
94 zoffadmin 4 public final Source getSource()
95     {
96 zoffadmin 2 if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
97     return source;
98     }
99    
100     /**
101     * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
102     * <p>
103     * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
104 zoffadmin 4 *
105 zoffadmin 2 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
106     */
107 zoffadmin 4 public final int getBegin()
108     {
109 zoffadmin 2 return begin;
110     }
111    
112     /**
113     * Returns the character position in the {@link Source} document immediately after the end of this segment.
114     * <p>
115     * The character at the position specified by this property is <b>not</b> included in the segment.
116 zoffadmin 4 *
117 zoffadmin 2 * @return the character position in the {@link Source} document immediately after the end of this segment.
118     * @see #getBegin()
119     */
120 zoffadmin 4 public final int getEnd()
121     {
122 zoffadmin 2 return end;
123     }
124    
125     /**
126     * Compares the specified object with this <code>Segment</code> for equality.
127     * <p>
128 zoffadmin 4 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>, and both segments have the same {@link Source}, and the same begin and end positions.
129     *
130     * @param object
131     * the object to be compared for equality with this <code>Segment</code>.
132 zoffadmin 2 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
133     */
134 zoffadmin 4 public final boolean equals(final Object object)
135     {
136     if (this == object) return true;
137     if (object == null || !(object instanceof Segment)) return false;
138     final Segment segment = (Segment) object;
139     return segment.begin == begin && segment.end == end && segment.source == source;
140 zoffadmin 2 }
141    
142     /**
143     * Returns a hash code value for the segment.
144     * <p>
145 zoffadmin 4 * The current implementation returns the sum of the begin and end positions, although this is not guaranteed in future versions.
146     *
147 zoffadmin 2 * @return a hash code value for the segment.
148     */
149 zoffadmin 4 public int hashCode()
150     {
151     return begin + end;
152 zoffadmin 2 }
153    
154     /**
155     * Returns the length of the segment.
156     * This is defined as the number of characters between the begin and end positions.
157 zoffadmin 4 *
158 zoffadmin 2 * @return the length of the segment.
159     */
160 zoffadmin 4 public int length()
161     {
162     return end - begin;
163 zoffadmin 2 }
164    
165     /**
166     * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
167     * <p>
168     * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
169     * <p>
170     * Note that a segment encloses itself.
171 zoffadmin 4 *
172     * @param segment
173     * the segment to be tested for being enclosed by this segment.
174 zoffadmin 2 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
175     */
176 zoffadmin 4 public final boolean encloses(final Segment segment)
177     {
178     return begin <= segment.begin && end >= segment.end;
179 zoffadmin 2 }
180    
181     /**
182     * Indicates whether this segment encloses the specified character position in the source document.
183     * <p>
184     * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
185 zoffadmin 4 *
186     * @param pos
187     * the position in the {@link Source} document.
188 zoffadmin 2 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
189     */
190 zoffadmin 4 public final boolean encloses(final int pos)
191     {
192     return begin <= pos && pos < end;
193 zoffadmin 2 }
194    
195     /**
196     * Returns the source text of this segment as a <code>String</code>.
197     * <p>
198 zoffadmin 4 * The returned <code>String</code> is newly created with every call to this method, unless this segment is itself an instance of {@link Source}.
199     *
200 zoffadmin 2 * @return the source text of this segment as a <code>String</code>.
201     */
202 zoffadmin 4 public String toString()
203     {
204     return source.subSequence(begin, end).toString();
205 zoffadmin 2 }
206    
207     /**
208     * Performs a simple rendering of the HTML markup in this segment into text.
209     * <p>
210 zoffadmin 4 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before {@linkplain Renderer#writeTo(Writer) obtaining its output}.
211 zoffadmin 2 *
212     * @return an instance of {@link Renderer} based on this segment.
213     * @see #getTextExtractor()
214     */
215 zoffadmin 4 public Renderer getRenderer()
216     {
217 zoffadmin 2 return new Renderer(this);
218     }
219    
220     /**
221     * Extracts the textual content from the HTML markup of this segment.
222     * <p>
223 zoffadmin 4 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
224 zoffadmin 2 * <p>
225 zoffadmin 4 *
226 zoffadmin 2 * @return an instance of {@link TextExtractor} based on this segment.
227     * @see #getRenderer()
228     */
229 zoffadmin 4 public TextExtractor getTextExtractor()
230     {
231 zoffadmin 2 return new TextExtractor(this);
232     }
233    
234     /**
235     * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
236     * <p>
237     * See the {@link Source#iterator()} method for a detailed description.
238     * <p>
239     * <dl>
240 zoffadmin 4 * <dt>Example:</dt>
241     * <dd>
242     * <p>
243     * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
244     * </p>
245     *
246 zoffadmin 2 * <pre>
247 zoffadmin 4 * for (Iterator&lt;Segment&gt; nodeIterator = segment.getNodeIterator(); nodeIterator.hasNext();)
248     * {
249     * Segment nodeSegment = nodeIterator.next();
250     * if (nodeSegment instanceof Tag)
251     * {
252     * Tag tag = (Tag) nodeSegment;
253     * // HANDLE TAG
254     * // Uncomment the following line to ensure each tag is valid XML:
255     * // writer.write(tag.tidy()); continue;
256     * }
257     * else if (nodeSegment instanceof CharacterReference)
258     * {
259     * CharacterReference characterReference = (CharacterReference) nodeSegment;
260     * // HANDLE CHARACTER REFERENCE
261     * // Uncomment the following line to decode all character references instead of copying them verbatim:
262     * // characterReference.appendCharTo(writer); continue;
263     * }
264     * else
265     * {
266     * // HANDLE PLAIN TEXT
267     * }
268     * // unless specific handling has prevented getting to here, simply output the segment as is:
269     * writer.write(nodeSegment.toString());
270     * }
271     * </pre>
272     *
273     * </dd>
274 zoffadmin 2 * </dl>
275 zoffadmin 4 *
276 zoffadmin 2 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
277     */
278 zoffadmin 4 public Iterator<Segment> getNodeIterator()
279     {
280 zoffadmin 2 return new NodeIterator(this);
281     }
282    
283     /**
284     * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
285     * <p>
286 zoffadmin 4 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
287 zoffadmin 2 * <p>
288     * See the {@link Tag} class documentation for more details about the behaviour of this method.
289 zoffadmin 4 *
290 zoffadmin 2 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
291     */
292 zoffadmin 4 public List<Tag> getAllTags()
293     {
294 zoffadmin 2 return getAllTags(null);
295     }
296    
297     /**
298     * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
299     * <p>
300     * See the {@link Tag} class documentation for more details about the behaviour of this method.
301     * <p>
302     * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
303 zoffadmin 4 *
304     * @param tagType
305     * the {@linkplain TagType type} of tags to get.
306 zoffadmin 2 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
307     * @see #getAllStartTags(StartTagType)
308     */
309 zoffadmin 4 public List<Tag> getAllTags(final TagType tagType)
310     {
311     Tag tag = checkTagEnclosure(Tag.getNextTag(source, begin, tagType));
312     if (tag == null) return Collections.emptyList();
313     final ArrayList<Tag> list = new ArrayList<Tag>();
314     do
315     {
316 zoffadmin 2 list.add(tag);
317 zoffadmin 4 tag = checkTagEnclosure(tag.getNextTag(tagType));
318     }
319     while (tag != null);
320 zoffadmin 2 return list;
321     }
322    
323     /**
324     * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
325     * <p>
326 zoffadmin 4 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
327 zoffadmin 2 * <p>
328     * See the {@link Tag} class documentation for more details about the behaviour of this method.
329 zoffadmin 4 *
330 zoffadmin 2 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
331     */
332 zoffadmin 4 public List<StartTag> getAllStartTags()
333     {
334     StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
335     if (startTag == null) return Collections.emptyList();
336     final ArrayList<StartTag> list = new ArrayList<StartTag>();
337     do
338     {
339 zoffadmin 2 list.add(startTag);
340 zoffadmin 4 startTag = checkEnclosure(startTag.getNextStartTag());
341     }
342     while (startTag != null);
343 zoffadmin 2 return list;
344     }
345    
346     /**
347     * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
348     * <p>
349     * See the {@link Tag} class documentation for more details about the behaviour of this method.
350     * <p>
351     * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
352 zoffadmin 4 *
353     * @param startTagType
354     * the {@linkplain StartTagType type} of tags to get.
355 zoffadmin 2 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
356     */
357 zoffadmin 4 public List<StartTag> getAllStartTags(final StartTagType startTagType)
358     {
359     if (startTagType == null) return getAllStartTags();
360     StartTag startTag = (StartTag) checkTagEnclosure(Tag.getNextTag(source, begin, startTagType));
361     if (startTag == null) return Collections.emptyList();
362     final ArrayList<StartTag> list = new ArrayList<StartTag>();
363     do
364     {
365 zoffadmin 2 list.add(startTag);
366 zoffadmin 4 startTag = (StartTag) checkTagEnclosure(startTag.getNextTag(startTagType));
367     }
368     while (startTag != null);
369 zoffadmin 2 return list;
370     }
371    
372     /**
373     * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
374     * <p>
375     * See the {@link Tag} class documentation for more details about the behaviour of this method.
376     * <p>
377     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
378     * <p>
379     * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
380 zoffadmin 4 *
381     * @param name
382     * the {@linkplain StartTag#getName() name} of the start tags to get.
383 zoffadmin 2 * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
384     */
385 zoffadmin 4 public List<StartTag> getAllStartTags(String name)
386     {
387     if (name == null) return getAllStartTags();
388     final boolean isXMLTagName = Tag.isXMLName(name);
389     StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
390     if (startTag == null) return Collections.emptyList();
391     final ArrayList<StartTag> list = new ArrayList<StartTag>();
392     do
393     {
394 zoffadmin 2 list.add(startTag);
395 zoffadmin 4 startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
396     }
397     while (startTag != null);
398 zoffadmin 2 return list;
399     }
400    
401     /**
402     * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
403     * <p>
404     * See the {@link Tag} class documentation for more details about the behaviour of this method.
405 zoffadmin 4 *
406     * @param attributeName
407     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
408     * @param value
409     * the value of the specified attribute to search for, must not be <code>null</code>.
410     * @param valueCaseSensitive
411     * specifies whether the attribute value matching is case sensitive.
412 zoffadmin 2 * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
413     * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
414     */
415 zoffadmin 4 public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive)
416     {
417     StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
418     if (startTag == null) return Collections.emptyList();
419     final ArrayList<StartTag> list = new ArrayList<StartTag>();
420     do
421     {
422 zoffadmin 2 list.add(startTag);
423 zoffadmin 4 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
424     }
425     while (startTag != null);
426 zoffadmin 2 return list;
427     }
428    
429     /**
430     * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
431     * <p>
432 zoffadmin 4 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
433 zoffadmin 2 * <p>
434     * See the {@link Tag} class documentation for more details about the behaviour of this method.
435 zoffadmin 4 *
436     * @param attributeName
437     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
438     * @param valueRegexPattern
439     * the regular expression pattern that must match the attribute value, may be <code>null</code>.
440 zoffadmin 2 * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
441     * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
442     */
443 zoffadmin 4 public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern)
444     {
445     StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
446     if (startTag == null) return Collections.emptyList();
447     final ArrayList<StartTag> list = new ArrayList<StartTag>();
448     do
449     {
450 zoffadmin 2 list.add(startTag);
451 zoffadmin 4 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
452     }
453     while (startTag != null);
454 zoffadmin 2 return list;
455     }
456    
457     /**
458     * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
459     * <p>
460 zoffadmin 4 * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
461 zoffadmin 2 * <p>
462     * See the {@link Tag} class documentation for more details about the behaviour of this method.
463 zoffadmin 4 *
464     * @param className
465     * the class name (case sensitive) to search for, must not be <code>null</code>.
466 zoffadmin 2 * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
467     */
468 zoffadmin 4 public List<StartTag> getAllStartTagsByClass(final String className)
469     {
470     return getAllStartTags("class", getClassPattern(className));
471 zoffadmin 2 }
472    
473     /**
474     * Returns a list of the immediate children of this segment in the document element hierarchy.
475     * <p>
476     * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
477     * <p>
478 zoffadmin 4 * An element found at the start of this segment is included in the list. Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead, which only returns the children of the element.
479 zoffadmin 2 * <p>
480     * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
481     * <p>
482     * The objects in the list are all of type {@link Element}.
483     * <p>
484 zoffadmin 4 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
485 zoffadmin 2 * <p>
486     * See the {@link Source#getChildElements()} method for more details.
487 zoffadmin 4 *
488 zoffadmin 2 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
489     * @see Element#getParentElement()
490     */
491 zoffadmin 4 public List<Element> getChildElements()
492     {
493     if (length() == 0) return Collections.emptyList();
494     List<Element> childElements = new ArrayList<Element>();
495     int pos = begin;
496     while (true)
497     {
498     final StartTag childStartTag = source.getNextStartTag(pos);
499     if (childStartTag == null || childStartTag.begin >= end) break;
500     if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag())
501     {
502     pos = childStartTag.end;
503 zoffadmin 2 continue;
504     }
505 zoffadmin 4 final Element childElement = childStartTag.getElement();
506 zoffadmin 2 childElements.add(childElement);
507     childElement.getChildElements();
508 zoffadmin 4 pos = childElement.end;
509 zoffadmin 2 }
510     return childElements;
511     }
512    
513     /**
514     * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
515     * <p>
516 zoffadmin 4 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
517 zoffadmin 2 * <p>
518     * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
519     * <p>
520     * If this segment is itself an {@link Element}, the result includes this element in the list.
521 zoffadmin 4 *
522 zoffadmin 2 * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
523     */
524 zoffadmin 4 public List<Element> getAllElements()
525     {
526 zoffadmin 2 return getAllElements(getAllStartTags());
527     }
528    
529     /**
530     * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
531     * <p>
532 zoffadmin 4 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method, except that elements which are not entirely enclosed by this segment are excluded.
533 zoffadmin 2 * <p>
534     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
535     * <p>
536     * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
537     * <p>
538     * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
539 zoffadmin 4 *
540     * @param name
541     * the {@linkplain Element#getName() name} of the elements to get.
542 zoffadmin 2 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
543     */
544 zoffadmin 4 public List<Element> getAllElements(String name)
545     {
546 zoffadmin 2 return getAllElements(getAllStartTags(name));
547     }
548    
549     /**
550     * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
551     * <p>
552 zoffadmin 4 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(StartTagType)} method, except that elements which are not entirely enclosed by this segment are excluded.
553 zoffadmin 2 * <p>
554     * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
555 zoffadmin 4 *
556     * @param startTagType
557     * the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
558 zoffadmin 2 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
559     */
560 zoffadmin 4 public List<Element> getAllElements(final StartTagType startTagType)
561     {
562     if (startTagType == null) throw new IllegalArgumentException("startTagType argument must not be null");
563 zoffadmin 2 return getAllElements(getAllStartTags(startTagType));
564     }
565    
566     /**
567     * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
568     * <p>
569 zoffadmin 4 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method, except that elements which are not entirely enclosed by this segment are excluded.
570 zoffadmin 2 * <p>
571     * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
572 zoffadmin 4 *
573     * @param attributeName
574     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
575     * @param value
576     * the value of the specified attribute to search for, must not be <code>null</code>.
577     * @param valueCaseSensitive
578     * specifies whether the attribute value matching is case sensitive.
579 zoffadmin 2 * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
580     * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
581     */
582 zoffadmin 4 public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive)
583     {
584     return getAllElements(getAllStartTags(attributeName, value, valueCaseSensitive));
585 zoffadmin 2 }
586    
587     /**
588     * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
589     * <p>
590 zoffadmin 4 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method, except that elements which are not entirely enclosed by this segment are excluded.
591 zoffadmin 2 * <p>
592 zoffadmin 4 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
593 zoffadmin 2 * <p>
594     * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
595 zoffadmin 4 *
596     * @param attributeName
597     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
598     * @param valueRegexPattern
599     * the regular expression pattern that must match the attribute value, may be <code>null</code>.
600 zoffadmin 2 * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
601     * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
602     */
603 zoffadmin 4 public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern)
604     {
605     return getAllElements(getAllStartTags(attributeName, valueRegexPattern));
606 zoffadmin 2 }
607    
608     /**
609     * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
610     * <p>
611 zoffadmin 4 * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
612 zoffadmin 2 * <p>
613 zoffadmin 4 * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method, except that elements which are not entirely enclosed by this segment are excluded.
614 zoffadmin 2 * <p>
615     * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
616 zoffadmin 4 *
617     * @param className
618     * the class name (case sensitive) to search for, must not be <code>null</code>.
619 zoffadmin 2 * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
620     */
621 zoffadmin 4 public List<Element> getAllElementsByClass(final String className)
622     {
623 zoffadmin 2 return getAllElements(getAllStartTagsByClass(className));
624     }
625    
626     /**
627     * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
628 zoffadmin 4 *
629 zoffadmin 2 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
630     */
631 zoffadmin 4 public List<CharacterReference> getAllCharacterReferences()
632     {
633     CharacterReference characterReference = getNextCharacterReference(begin);
634     if (characterReference == null) return Collections.emptyList();
635     final ArrayList<CharacterReference> list = new ArrayList<CharacterReference>();
636     do
637     {
638 zoffadmin 2 list.add(characterReference);
639 zoffadmin 4 characterReference = getNextCharacterReference(characterReference.end);
640     }
641     while (characterReference != null);
642 zoffadmin 2 return list;
643     }
644    
645     /**
646     * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
647     * <p>
648     * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
649     * <table class="bordered" cellspacing="0">
650 zoffadmin 4 * <tr>
651     * <th>HTML element name
652     * <th>Attribute name
653     * <tr>
654     * <td>{@link HTMLElementName#A A}
655     * <td>href
656     * <tr>
657     * <td>{@link HTMLElementName#APPLET APPLET}
658     * <td>codebase
659     * <tr>
660     * <td>{@link HTMLElementName#AREA AREA}
661     * <td>href
662     * <tr>
663     * <td>{@link HTMLElementName#BASE BASE}
664     * <td>href
665     * <tr>
666     * <td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}
667     * <td>cite
668     * <tr>
669     * <td>{@link HTMLElementName#BODY BODY}
670     * <td>background
671     * <tr>
672     * <td>{@link HTMLElementName#FORM FORM}
673     * <td>action
674     * <tr>
675     * <td>{@link HTMLElementName#FRAME FRAME}
676     * <td>longdesc
677     * <tr>
678     * <td>{@link HTMLElementName#FRAME FRAME}
679     * <td>src
680     * <tr>
681     * <td>{@link HTMLElementName#DEL DEL}
682     * <td>cite
683     * <tr>
684     * <td>{@link HTMLElementName#HEAD HEAD}
685     * <td>profile
686     * <tr>
687     * <td>{@link HTMLElementName#IFRAME IFRAME}
688     * <td>longdesc
689     * <tr>
690     * <td>{@link HTMLElementName#IFRAME IFRAME}
691     * <td>src
692     * <tr>
693     * <td>{@link HTMLElementName#IMG IMG}
694     * <td>longdesc
695     * <tr>
696     * <td>{@link HTMLElementName#IMG IMG}
697     * <td>src
698     * <tr>
699     * <td>{@link HTMLElementName#IMG IMG}
700     * <td>usemap
701     * <tr>
702     * <td>{@link HTMLElementName#INPUT INPUT}
703     * <td>src
704     * <tr>
705     * <td>{@link HTMLElementName#INPUT INPUT}
706     * <td>usemap
707     * <tr>
708     * <td>{@link HTMLElementName#INS INS}
709     * <td>cite
710     * <tr>
711     * <td>{@link HTMLElementName#LINK LINK}
712     * <td>href
713     * <tr>
714     * <td>{@link HTMLElementName#OBJECT OBJECT}
715     * <td>classid
716     * <tr>
717     * <td>{@link HTMLElementName#OBJECT OBJECT}
718     * <td>codebase
719     * <tr>
720     * <td>{@link HTMLElementName#OBJECT OBJECT}
721     * <td>data
722     * <tr>
723     * <td>{@link HTMLElementName#OBJECT OBJECT}
724     * <td>usemap
725     * <tr>
726     * <td>{@link HTMLElementName#Q Q}
727     * <td>cite
728     * <tr>
729     * <td>{@link HTMLElementName#SCRIPT SCRIPT}
730     * <td>src
731 zoffadmin 2 * </table>
732     * <p>
733     * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
734     * <p>
735     * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
736     * <p>
737     * The attributes are returned in order of appearance.
738 zoffadmin 4 *
739 zoffadmin 2 * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
740     * @see #getStyleURISegments()
741     */
742 zoffadmin 4 public List<Attribute> getURIAttributes()
743     {
744 zoffadmin 2 return URIAttributes.getList(this);
745     }
746    
747     /**
748 zoffadmin 4 * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
749 zoffadmin 2 * <p>
750     * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
751     * <p>
752 zoffadmin 4 * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
753 zoffadmin 2 * <p>
754     * The segments are returned in order of appearance.
755 zoffadmin 4 *
756 zoffadmin 2 * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
757     * @see #getURIAttributes()
758     */
759 zoffadmin 4 public List<Segment> getStyleURISegments()
760     {
761 zoffadmin 2 return URIAttributes.getStyleURISegments(this);
762     }
763    
764     /**
765     * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
766     * <p>
767 zoffadmin 4 * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
768     *
769 zoffadmin 2 * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
770     */
771 zoffadmin 4 public final StartTag getFirstStartTag()
772     {
773 zoffadmin 2 return checkEnclosure(source.getNextStartTag(begin));
774     }
775 zoffadmin 4
776 zoffadmin 2 /**
777     * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
778     * <p>
779 zoffadmin 4 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
780     *
781     * @param startTagType
782     * the <code>StartTagType</code> to search for.
783 zoffadmin 2 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
784     */
785 zoffadmin 4 public final StartTag getFirstStartTag(StartTagType startTagType)
786     {
787     return checkEnclosure(source.getNextStartTag(begin, startTagType));
788 zoffadmin 2 }
789 zoffadmin 4
790 zoffadmin 2 /**
791     * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
792     * <p>
793 zoffadmin 4 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
794 zoffadmin 2 * <p>
795     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
796 zoffadmin 4 *
797     * @param name
798     * the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
799 zoffadmin 2 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
800     */
801 zoffadmin 4 public final StartTag getFirstStartTag(String name)
802     {
803     return checkEnclosure(source.getNextStartTag(begin, name));
804 zoffadmin 2 }
805 zoffadmin 4
806 zoffadmin 2 /**
807     * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
808     * <p>
809 zoffadmin 4 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
810     *
811     * @param attributeName
812     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
813     * @param value
814     * the value of the specified attribute to search for, must not be <code>null</code>.
815     * @param valueCaseSensitive
816     * specifies whether the attribute value matching is case sensitive.
817 zoffadmin 2 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
818     * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
819     */
820 zoffadmin 4 public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
821     {
822     return checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
823 zoffadmin 2 }
824    
825     /**
826     * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
827     * <p>
828 zoffadmin 4 * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
829     *
830     * @param attributeName
831     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
832     * @param valueRegexPattern
833     * the regular expression pattern that must match the attribute value, may be <code>null</code>.
834 zoffadmin 2 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
835     * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
836     */
837 zoffadmin 4 public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern)
838     {
839     return checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
840 zoffadmin 2 }
841    
842     /**
843     * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
844     * <p>
845 zoffadmin 4 * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
846     *
847     * @param className
848     * the class name (case sensitive) to search for, must not be <code>null</code>.
849 zoffadmin 2 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
850     */
851 zoffadmin 4 public final StartTag getFirstStartTagByClass(final String className)
852     {
853     return checkEnclosure(source.getNextStartTagByClass(begin, className));
854 zoffadmin 2 }
855 zoffadmin 4
856 zoffadmin 2 /**
857     * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
858     * <p>
859 zoffadmin 4 * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
860 zoffadmin 2 * <p>
861     * If this segment is itself an {@link Element}, this element is returned, not the first child element.
862 zoffadmin 4 *
863 zoffadmin 2 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
864     */
865 zoffadmin 4 public final Element getFirstElement()
866     {
867     StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
868     while (startTag != null)
869     {
870     final Element element = startTag.getElement();
871     if (element.end <= end) return element;
872     startTag = checkEnclosure(startTag.getNextStartTag());
873 zoffadmin 2 }
874     return null;
875     }
876 zoffadmin 4
877 zoffadmin 2 /**
878     * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
879     * <p>
880 zoffadmin 4 * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
881 zoffadmin 2 * <p>
882     * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
883     * <p>
884     * If this segment is itself an {@link Element} with the specified name, this element is returned.
885 zoffadmin 4 *
886     * @param name
887     * the {@linkplain Element#getName() name} of the element to search for.
888 zoffadmin 2 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
889     */
890 zoffadmin 4 public final Element getFirstElement(String name)
891     {
892     if (name == null) return getFirstElement();
893     final boolean isXMLTagName = Tag.isXMLName(name);
894     StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
895     while (startTag != null)
896     {
897     final Element element = startTag.getElement();
898     if (element.end <= end) return element;
899     startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
900 zoffadmin 2 }
901     return null;
902     }
903 zoffadmin 4
904 zoffadmin 2 /**
905     * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
906     * <p>
907 zoffadmin 4 * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
908 zoffadmin 2 * <p>
909     * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
910 zoffadmin 4 *
911     * @param attributeName
912     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
913     * @param value
914     * the value of the specified attribute to search for, must not be <code>null</code>.
915     * @param valueCaseSensitive
916     * specifies whether the attribute value matching is case sensitive.
917 zoffadmin 2 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
918     * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
919     */
920 zoffadmin 4 public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
921     {
922     StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
923     while (startTag != null)
924     {
925     final Element element = startTag.getElement();
926     if (element.end <= end) return element;
927     startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
928 zoffadmin 2 }
929     return null;
930     }
931    
932     /**
933     * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
934     * <p>
935 zoffadmin 4 * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
936 zoffadmin 2 * <p>
937     * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
938 zoffadmin 4 *
939     * @param attributeName
940     * the attribute name (case insensitive) to search for, must not be <code>null</code>.
941     * @param valueRegexPattern
942     * the regular expression pattern that must match the attribute value, may be <code>null</code>.
943 zoffadmin 2 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
944     * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
945     */
946 zoffadmin 4 public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern)
947     {
948     StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
949     while (startTag != null)
950     {
951     final Element element = startTag.getElement();
952     if (element.end <= end) return element;
953     startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
954 zoffadmin 2 }
955     return null;
956     }
957    
958     /**
959     * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
960     * <p>
961 zoffadmin 4 * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
962 zoffadmin 2 * <p>
963     * If this segment is itself an {@link Element} with the specified class, this element is returned.
964 zoffadmin 4 *
965     * @param className
966     * the class name (case sensitive) to search for, must not be <code>null</code>.
967 zoffadmin 2 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
968     */
969 zoffadmin 4 public final Element getFirstElementByClass(final String className)
970     {
971     StartTag startTag = checkEnclosure(source.getNextStartTagByClass(begin, className));
972     while (startTag != null)
973     {
974     final Element element = startTag.getElement();
975     if (element.end <= end) return element;
976     startTag = checkEnclosure(source.getNextStartTagByClass(startTag.begin + 1, className));
977 zoffadmin 2 }
978     return null;
979     }
980    
981     /**
982     * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
983 zoffadmin 4 *
984 zoffadmin 2 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
985     */
986 zoffadmin 4 public List<FormControl> getFormControls()
987     {
988 zoffadmin 2 return FormControl.getAll(this);
989     }
990    
991     /**
992     * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
993     * <p>
994     * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
995 zoffadmin 4 *
996 zoffadmin 2 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
997     * @see #getFormControls()
998     */
999 zoffadmin 4 public FormFields getFormFields()
1000     {
1001 zoffadmin 2 return new FormFields(getFormControls());
1002     }
1003    
1004     /**
1005     * Parses any {@link Attributes} within this segment.
1006     * This method is only used in the unusual situation where attributes exist outside of a start tag.
1007     * The {@link StartTag#getAttributes()} method should be used in normal situations.
1008     * <p>
1009     * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
1010 zoffadmin 4 *
1011 zoffadmin 2 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
1012     */
1013 zoffadmin 4 public Attributes parseAttributes()
1014     {
1015     return source.parseAttributes(begin, end);
1016 zoffadmin 2 }
1017    
1018     /**
1019     * Causes this segment to be ignored when parsing.
1020     * <p>
1021     * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
1022     * <p>
1023 zoffadmin 4 * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value), as well as preventing non-server tags from being recognised inside server tags.
1024 zoffadmin 2 * <p>
1025 zoffadmin 4 * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags, as the attributes parser automatically ignores any server tags.
1026 zoffadmin 2 * <p>
1027 zoffadmin 4 * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements, as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
1028 zoffadmin 2 * <p>
1029     * This leaves only very few scenarios where calling this method still provides a significant benefit.
1030     * <p>
1031 zoffadmin 4 * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags. Here is an example using an XML-style JSP tag: <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote> The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute, as there is no way for the parser to recognise the <code>i18n:resource</code> element
1032     * as a server tag. Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice, but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
1033 zoffadmin 2 * <p>
1034 zoffadmin 4 * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely. Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of tools such as {@link TextExtractor} and {@link Renderer}.
1035 zoffadmin 2 * <p>
1036 zoffadmin 4 * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment. Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())} and perform the desired operations on this new source object.
1037 zoffadmin 2 * <p>
1038     * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
1039     * <p>
1040 zoffadmin 4 * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache}, and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>. If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache. Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
1041 zoffadmin 2 * <p>
1042 zoffadmin 4 * For best performance, this method should be called on all segments that need to be ignored without calling any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
1043     *
1044 zoffadmin 2 * @see Source#ignoreWhenParsing(Collection segments)
1045     */
1046 zoffadmin 4 public void ignoreWhenParsing()
1047     {
1048     source.ignoreWhenParsing(begin, end);
1049 zoffadmin 2 }
1050    
1051     /**
1052     * Compares this <code>Segment</code> object to another object.
1053     * <p>
1054     * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
1055     * <p>
1056 zoffadmin 4 * A segment is considered to be before another segment if its begin position is earlier, or in the case that both segments begin at the same position, its end position is earlier.
1057 zoffadmin 2 * <p>
1058 zoffadmin 4 * Segments that begin and end at the same position are considered equal for the purposes of this comparison, even if they relate to different source documents.
1059 zoffadmin 2 * <p>
1060 zoffadmin 4 * Note: this class has a natural ordering that is inconsistent with equals. This means that this method may return zero in some cases where calling the {@link #equals(Object)} method with the same argument returns <code>false</code>.
1061     *
1062     * @param segment
1063     * the segment to be compared
1064 zoffadmin 2 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
1065 zoffadmin 4 * @throws ClassCastException
1066     * if the argument is not a <code>Segment</code>
1067 zoffadmin 2 */
1068 zoffadmin 4 public int compareTo(final Segment segment)
1069     {
1070     if (this == segment) return 0;
1071     if (begin < segment.begin) return -1;
1072     if (begin > segment.begin) return 1;
1073     if (end < segment.end) return -1;
1074     if (end > segment.end) return 1;
1075 zoffadmin 2 return 0;
1076     }
1077    
1078     /**
1079     * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
1080 zoffadmin 4 *
1081 zoffadmin 2 * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
1082     */
1083 zoffadmin 4 public final boolean isWhiteSpace()
1084     {
1085     for (int i = begin; i < end; i++)
1086 zoffadmin 2 if (!isWhiteSpace(source.charAt(i))) return false;
1087     return true;
1088     }
1089    
1090     /**
1091     * Returns an indication of the maximum depth of nested elements within this segment.
1092     * <p>
1093 zoffadmin 4 * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code> if its content is parsed.
1094 zoffadmin 2 * <p>
1095 zoffadmin 4 * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught. The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling this method to check every segment or document will very often exceed any benefit.
1096 zoffadmin 2 * <p>
1097 zoffadmin 4 * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application and other factors.
1098 zoffadmin 2 * <p>
1099 zoffadmin 4 * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the {@link Element#getDepth()} method on the most nested element.
1100     *
1101 zoffadmin 2 * @return an indication of the maximum depth of nested elements within this segment.
1102     */
1103 zoffadmin 4 public int getMaxDepthIndicator()
1104     {
1105     int maxDepth = 0;
1106     int depth = 0;
1107     for (Tag tag : getAllTags())
1108     {
1109     if (tag instanceof StartTag)
1110     {
1111     StartTag startTag = (StartTag) tag;
1112     if (startTag.getStartTagType().getCorrespondingEndTagType() == null) continue;
1113 zoffadmin 2 if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
1114     if (startTag.isEmptyElementTag()) continue;
1115     depth++;
1116 zoffadmin 4 if (depth > maxDepth) maxDepth++;
1117     }
1118     else
1119     {
1120 zoffadmin 2 depth--;
1121     }
1122     }
1123     return maxDepth;
1124     }
1125    
1126     /**
1127     * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
1128     * <p>
1129 zoffadmin 4 * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a> specifies the following white space characters:
1130 zoffadmin 2 * <ul>
1131 zoffadmin 4 * <li>space (U+0020)
1132     * <li>tab (U+0009)
1133     * <li>form feed (U+000C)
1134     * <li>line feed (U+000A)
1135     * <li>carriage return (U+000D)
1136     * <li>zero-width space (U+200B)
1137 zoffadmin 2 * </ul>
1138     * <p>
1139 zoffadmin 4 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not recognise them as white space and renders them as an unprintable character (empty square). Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
1140     *
1141     * @param ch
1142     * the character to test.
1143 zoffadmin 2 * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
1144     */
1145 zoffadmin 4 public static final boolean isWhiteSpace(final char ch)
1146     {
1147     for (char whiteSpaceChar : WHITESPACE)
1148     if (ch == whiteSpaceChar) return true;
1149 zoffadmin 2 return false;
1150     }
1151    
1152     /**
1153     * Returns a string representation of this object useful for debugging purposes.
1154 zoffadmin 4 *
1155 zoffadmin 2 * @return a string representation of this object useful for debugging purposes.
1156     */
1157 zoffadmin 4 public String getDebugInfo()
1158     {
1159     final StringBuilder sb = new StringBuilder(50);
1160 zoffadmin 2 sb.append('(');
1161     source.getRowColumnVector(begin).appendTo(sb);
1162     sb.append('-');
1163     source.getRowColumnVector(end).appendTo(sb);
1164     sb.append(')');
1165     return sb.toString();
1166     }
1167    
1168     /**
1169     * Returns the character at the specified index.
1170     * <p>
1171 zoffadmin 4 * This is logically equivalent to <code>toString().charAt(index)</code> for valid argument values <code>0 <= index < length()</code>.
1172 zoffadmin 2 * <p>
1173 zoffadmin 4 * However because this implementation works directly on the underlying document source string, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for an invalid argument value.
1174     *
1175     * @param index
1176     * the index of the character.
1177 zoffadmin 2 * @return the character at the specified index.
1178     */
1179 zoffadmin 4 public char charAt(final int index)
1180     {
1181     return source.charAt(begin + index);
1182 zoffadmin 2 }
1183    
1184     /**
1185     * Returns a new character sequence that is a subsequence of this sequence.
1186     * <p>
1187 zoffadmin 4 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code> for valid values of <code>beginIndex</code> and <code>endIndex</code>.
1188 zoffadmin 2 * <p>
1189 zoffadmin 4 * However because this implementation works directly on the underlying document source text, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
1190     *
1191     * @param beginIndex
1192     * the begin index, inclusive.
1193     * @param endIndex
1194     * the end index, exclusive.
1195 zoffadmin 2 * @return a new character sequence that is a subsequence of this sequence.
1196     */
1197 zoffadmin 4 public CharSequence subSequence(final int beginIndex, final int endIndex)
1198     {
1199     return source.subSequence(begin + beginIndex, begin + endIndex);
1200 zoffadmin 2 }
1201    
1202     /**
1203     * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
1204     * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
1205     */
1206 zoffadmin 4 static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text)
1207     {
1208     final int textLength = text.length();
1209     int i = 0;
1210     boolean lastWasWhiteSpace = false;
1211     while (true)
1212     {
1213     if (i >= textLength) return sb;
1214 zoffadmin 2 if (!isWhiteSpace(text.charAt(i))) break;
1215     i++;
1216     }
1217 zoffadmin 4 do
1218     {
1219     final char ch = text.charAt(i++);
1220     if (isWhiteSpace(ch))
1221     {
1222     lastWasWhiteSpace = true;
1223     }
1224     else
1225     {
1226     if (lastWasWhiteSpace)
1227     {
1228 zoffadmin 2 sb.append(' ');
1229 zoffadmin 4 lastWasWhiteSpace = false;
1230 zoffadmin 2 }
1231     sb.append(ch);
1232     }
1233 zoffadmin 4 }
1234     while (i < textLength);
1235 zoffadmin 2 return sb;
1236     }
1237    
1238 zoffadmin 4 static final Pattern getClassPattern(final String className)
1239     {
1240     return Pattern.compile(".*(\\s|^)" + className + "(\\s|$).*", Pattern.DOTALL);
1241 zoffadmin 2 }
1242    
1243 zoffadmin 4 private List<Element> getAllElements(final List<StartTag> startTags)
1244     {
1245 zoffadmin 2 if (startTags.isEmpty()) return Collections.emptyList();
1246 zoffadmin 4 final ArrayList<Element> elements = new ArrayList<Element>(startTags.size());
1247     for (StartTag startTag : startTags)
1248     {
1249     final Element element = startTag.getElement();
1250     if (element.end <= end) elements.add(element);
1251 zoffadmin 2 }
1252     return elements;
1253     }
1254    
1255 zoffadmin 4 private StartTag checkEnclosure(final StartTag startTag)
1256     {
1257     if (startTag == null || startTag.end > end) return null;
1258 zoffadmin 2 return startTag;
1259     }
1260    
1261 zoffadmin 4 private Tag checkTagEnclosure(final Tag tag)
1262     {
1263     if (tag == null || tag.end > end) return null;
1264 zoffadmin 2 return tag;
1265     }
1266    
1267 zoffadmin 4 private CharacterReference getNextCharacterReference(final int pos)
1268     {
1269     final CharacterReference characterReference = source.getNextCharacterReference(pos);
1270     if (characterReference == null || characterReference.end > end) return null;
1271 zoffadmin 2 return characterReference;
1272     }
1273     }

   
Visit the aagtl Website