/[aagtl_public1]/src/net/htmlparser/jericho/Segment.java
aagtl

Contents of /src/net/htmlparser/jericho/Segment.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 4 - (show annotations) (download)
Sat Aug 1 08:47:10 2015 UTC (8 years, 7 months ago) by zoffadmin
File size: 60940 byte(s)
1.0.35
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.io.Writer;
24 import java.util.ArrayList;
25 import java.util.Collection;
26 import java.util.Collections;
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.regex.Pattern;
30
31 /**
32 * Represents a segment of a {@link Source} document.
33 * <p>
34 * Many of the <a href="Tag.html#TagSearchMethods">tag search methods</a> are defined in this class.
35 * <p>
36 * The <i>span</i> of a segment is defined by the combination of its begin and end character positions.
37 */
38 public class Segment implements Comparable<Segment>, CharSequence
39 {
// Character position in the source document at which this segment begins (inclusive).
40 final int begin;
// Character position in the source document at which this segment ends (exclusive).
41 final int end;
// The Source document this segment belongs to; set to null by the package-private (int, int) constructor used for dummy flag instances.
42 final Source source;
43
// Set of characters regarded as white space; presumably consulted by isWhiteSpace(char) - confirm there.
44 private static final char[] WHITESPACE = { ' ', '\n', '\r', '\t', '\f', '\u200B' }; // see comments in isWhiteSpace(char) method
45
/**
 * Constructs a new <code>Segment</code> within the specified {@linkplain Source source} document with the specified begin and end character positions.
 *
 * @param source
 *            the {@link Source} document, must not be <code>null</code>.
 * @param begin
 *            the character position in the source where this segment {@linkplain #getBegin() begins}, inclusive.
 * @param end
 *            the character position in the source where this segment {@linkplain #getEnd() ends}, exclusive.
 * @throws IllegalArgumentException
 *             if <code>begin</code> or <code>end</code> is <code>-1</code>, if <code>begin</code> is greater than <code>end</code>,
 *             or if <code>source</code> is <code>null</code>.
 */
public Segment(final Source source, final int begin, final int end)
{
    // Validate everything before assigning any field; the span is checked first, then the source.
    if (begin == -1 || end == -1 || begin > end)
    {
        // Include the offending values so a bad span can be diagnosed from the stack trace alone.
        throw new IllegalArgumentException("invalid segment span: begin=" + begin + ", end=" + end);
    }
    if (source == null)
    {
        throw new IllegalArgumentException("source argument must not be null");
    }
    this.begin = begin;
    this.end = end;
    this.source = source;
}
64
// Package-private: only called from the Source constructor.
// The new instance acts as its own source document and spans positions 0 (inclusive) to length (exclusive).
Segment(final int length)
{
    this.begin = 0;
    this.end = length;
    this.source = (Source) this;
}
72
// Creates an empty (0, 0) segment without a source document.
// Only used for creating dummy flag instances of this type (see Tag.NOT_CACHED and Element.NOT_CACHED).
Segment()
{
    this(0, 0);
}
78
// Stores the given positions unchecked and leaves the source document null.
// Only used for creating dummy flag instances of this type (see Segment() constructor and StreamedSource.START_SEGMENT).
Segment(final int begin, final int end)
{
    this.source = null;
    this.begin = begin;
    this.end = end;
}
86
87 /**
88 * Returns the {@link Source} document containing this segment.
89 * <p>
90 * If a {@link StreamedSource} is in use, this method throws an <code>UnsupportedOperationException</code>.
91 *
92 * @return the {@link Source} document containing this segment.
93 */
94 public final Source getSource()
95 {
96 if (source.isStreamed()) throw new UnsupportedOperationException("Source object is not available when using StreamedSource");
97 return source;
98 }
99
100 /**
101 * Returns the character position in the {@link Source} document at which this segment begins, inclusive.
102 * <p>
103 * Use the {@link Source#getRowColumnVector(int pos)} method to determine the row and column numbers corresponding to this character position.
104 *
105 * @return the character position in the {@link Source} document at which this segment begins, inclusive.
106 */
107 public final int getBegin()
108 {
109 return begin;
110 }
111
112 /**
113 * Returns the character position in the {@link Source} document immediately after the end of this segment.
114 * <p>
115 * The character at the position specified by this property is <b>not</b> included in the segment.
116 *
117 * @return the character position in the {@link Source} document immediately after the end of this segment.
118 * @see #getBegin()
119 */
120 public final int getEnd()
121 {
122 return end;
123 }
124
125 /**
126 * Compares the specified object with this <code>Segment</code> for equality.
127 * <p>
128 * Returns <code>true</code> if and only if the specified object is also a <code>Segment</code>, and both segments have the same {@link Source}, and the same begin and end positions.
129 *
130 * @param object
131 * the object to be compared for equality with this <code>Segment</code>.
132 * @return <code>true</code> if the specified object is equal to this <code>Segment</code>, otherwise <code>false</code>.
133 */
134 public final boolean equals(final Object object)
135 {
136 if (this == object) return true;
137 if (object == null || !(object instanceof Segment)) return false;
138 final Segment segment = (Segment) object;
139 return segment.begin == begin && segment.end == end && segment.source == source;
140 }
141
142 /**
143 * Returns a hash code value for the segment.
144 * <p>
145 * The current implementation returns the sum of the begin and end positions, although this is not guaranteed in future versions.
146 *
147 * @return a hash code value for the segment.
148 */
149 public int hashCode()
150 {
151 return begin + end;
152 }
153
154 /**
155 * Returns the length of the segment.
156 * This is defined as the number of characters between the begin and end positions.
157 *
158 * @return the length of the segment.
159 */
160 public int length()
161 {
162 return end - begin;
163 }
164
165 /**
166 * Indicates whether this <code>Segment</code> encloses the specified <code>Segment</code>.
167 * <p>
168 * This is the case if {@link #getBegin()}<code>&lt;=segment.</code>{@link #getBegin()}<code> &amp;&amp; </code>{@link #getEnd()}<code>&gt;=segment.</code>{@link #getEnd()}.
169 * <p>
170 * Note that a segment encloses itself.
171 *
172 * @param segment
173 * the segment to be tested for being enclosed by this segment.
174 * @return <code>true</code> if this <code>Segment</code> encloses the specified <code>Segment</code>, otherwise <code>false</code>.
175 */
176 public final boolean encloses(final Segment segment)
177 {
178 return begin <= segment.begin && end >= segment.end;
179 }
180
181 /**
182 * Indicates whether this segment encloses the specified character position in the source document.
183 * <p>
184 * This is the case if {@link #getBegin()}<code> &lt;= pos &lt; </code>{@link #getEnd()}.
185 *
186 * @param pos
187 * the position in the {@link Source} document.
188 * @return <code>true</code> if this segment encloses the specified character position in the source document, otherwise <code>false</code>.
189 */
190 public final boolean encloses(final int pos)
191 {
192 return begin <= pos && pos < end;
193 }
194
195 /**
196 * Returns the source text of this segment as a <code>String</code>.
197 * <p>
198 * The returned <code>String</code> is newly created with every call to this method, unless this segment is itself an instance of {@link Source}.
199 *
200 * @return the source text of this segment as a <code>String</code>.
201 */
202 public String toString()
203 {
204 return source.subSequence(begin, end).toString();
205 }
206
207 /**
208 * Performs a simple rendering of the HTML markup in this segment into text.
209 * <p>
210 * The output can be configured by setting any number of properties on the returned {@link Renderer} instance before {@linkplain Renderer#writeTo(Writer) obtaining its output}.
211 *
212 * @return an instance of {@link Renderer} based on this segment.
213 * @see #getTextExtractor()
214 */
215 public Renderer getRenderer()
216 {
217 return new Renderer(this);
218 }
219
220 /**
221 * Extracts the textual content from the HTML markup of this segment.
222 * <p>
223 * The output can be configured by setting properties on the returned {@link TextExtractor} instance before {@linkplain TextExtractor#writeTo(Writer) obtaining its output}.
224 * <p>
225 *
226 * @return an instance of {@link TextExtractor} based on this segment.
227 * @see #getRenderer()
228 */
229 public TextExtractor getTextExtractor()
230 {
231 return new TextExtractor(this);
232 }
233
234 /**
235 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
236 * <p>
237 * See the {@link Source#iterator()} method for a detailed description.
238 * <p>
239 * <dl>
240 * <dt>Example:</dt>
241 * <dd>
242 * <p>
243 * The following code demonstrates the typical usage of this method to make an exact copy of this segment to <code>writer</code> (assuming no server tags are present):
244 * </p>
245 *
246 * <pre>
247 * for (Iterator&lt;Segment&gt; nodeIterator = segment.getNodeIterator(); nodeIterator.hasNext();)
248 * {
249 * Segment nodeSegment = nodeIterator.next();
250 * if (nodeSegment instanceof Tag)
251 * {
252 * Tag tag = (Tag) nodeSegment;
253 * // HANDLE TAG
254 * // Uncomment the following line to ensure each tag is valid XML:
255 * // writer.write(tag.tidy()); continue;
256 * }
257 * else if (nodeSegment instanceof CharacterReference)
258 * {
259 * CharacterReference characterReference = (CharacterReference) nodeSegment;
260 * // HANDLE CHARACTER REFERENCE
261 * // Uncomment the following line to decode all character references instead of copying them verbatim:
262 * // characterReference.appendCharTo(writer); continue;
263 * }
264 * else
265 * {
266 * // HANDLE PLAIN TEXT
267 * }
268 * // unless specific handling has prevented getting to here, simply output the segment as is:
269 * writer.write(nodeSegment.toString());
270 * }
271 * </pre>
272 *
273 * </dd>
274 * </dl>
275 *
276 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and <a href="Source.html#PlainText">plain text</a> segment contained within this segment.
277 */
278 public Iterator<Segment> getNodeIterator()
279 {
280 return new NodeIterator(this);
281 }
282
283 /**
284 * Returns a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
285 * <p>
286 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
287 * <p>
288 * See the {@link Tag} class documentation for more details about the behaviour of this method.
289 *
290 * @return a list of all {@link Tag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
291 */
292 public List<Tag> getAllTags()
293 {
294 return getAllTags(null);
295 }
296
297 /**
298 * Returns a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
299 * <p>
300 * See the {@link Tag} class documentation for more details about the behaviour of this method.
301 * <p>
302 * Specifying a <code>null</code> argument to the <code>tagType</code> parameter is equivalent to {@link #getAllTags()}.
303 *
304 * @param tagType
305 * the {@linkplain TagType type} of tags to get.
306 * @return a list of all {@link Tag} objects of the specified {@linkplain TagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
307 * @see #getAllStartTags(StartTagType)
308 */
309 public List<Tag> getAllTags(final TagType tagType)
310 {
311 Tag tag = checkTagEnclosure(Tag.getNextTag(source, begin, tagType));
312 if (tag == null) return Collections.emptyList();
313 final ArrayList<Tag> list = new ArrayList<Tag>();
314 do
315 {
316 list.add(tag);
317 tag = checkTagEnclosure(tag.getNextTag(tagType));
318 }
319 while (tag != null);
320 return list;
321 }
322
323 /**
324 * Returns a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
325 * <p>
326 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
327 * <p>
328 * See the {@link Tag} class documentation for more details about the behaviour of this method.
329 *
330 * @return a list of all {@link StartTag} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
331 */
332 public List<StartTag> getAllStartTags()
333 {
334 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
335 if (startTag == null) return Collections.emptyList();
336 final ArrayList<StartTag> list = new ArrayList<StartTag>();
337 do
338 {
339 list.add(startTag);
340 startTag = checkEnclosure(startTag.getNextStartTag());
341 }
342 while (startTag != null);
343 return list;
344 }
345
346 /**
347 * Returns a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
348 * <p>
349 * See the {@link Tag} class documentation for more details about the behaviour of this method.
350 * <p>
351 * Specifying a <code>null</code> argument to the <code>startTagType</code> parameter is equivalent to {@link #getAllStartTags()}.
352 *
353 * @param startTagType
354 * the {@linkplain StartTagType type} of tags to get.
355 * @return a list of all {@link StartTag} objects of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
356 */
357 public List<StartTag> getAllStartTags(final StartTagType startTagType)
358 {
359 if (startTagType == null) return getAllStartTags();
360 StartTag startTag = (StartTag) checkTagEnclosure(Tag.getNextTag(source, begin, startTagType));
361 if (startTag == null) return Collections.emptyList();
362 final ArrayList<StartTag> list = new ArrayList<StartTag>();
363 do
364 {
365 list.add(startTag);
366 startTag = (StartTag) checkTagEnclosure(startTag.getNextTag(startTagType));
367 }
368 while (startTag != null);
369 return list;
370 }
371
372 /**
373 * Returns a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
374 * <p>
375 * See the {@link Tag} class documentation for more details about the behaviour of this method.
376 * <p>
377 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllStartTags()}, which may include non-{@linkplain StartTagType#NORMAL normal} start tags.
378 * <p>
379 * This method also returns {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
380 *
381 * @param name
382 * the {@linkplain StartTag#getName() name} of the start tags to get.
383 * @return a list of all {@linkplain StartTagType#NORMAL normal} {@link StartTag} objects with the specified {@linkplain StartTag#getName() name} that are {@linkplain #encloses(Segment) enclosed} by this segment.
384 */
385 public List<StartTag> getAllStartTags(String name)
386 {
387 if (name == null) return getAllStartTags();
388 final boolean isXMLTagName = Tag.isXMLName(name);
389 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
390 if (startTag == null) return Collections.emptyList();
391 final ArrayList<StartTag> list = new ArrayList<StartTag>();
392 do
393 {
394 list.add(startTag);
395 startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
396 }
397 while (startTag != null);
398 return list;
399 }
400
401 /**
402 * Returns a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
403 * <p>
404 * See the {@link Tag} class documentation for more details about the behaviour of this method.
405 *
406 * @param attributeName
407 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
408 * @param value
409 * the value of the specified attribute to search for, must not be <code>null</code>.
410 * @param valueCaseSensitive
411 * specifies whether the attribute value matching is case sensitive.
412 * @return a list of all {@link StartTag} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
413 * @see #getAllStartTags(String attributeName, Pattern valueRegexPattern)
414 */
415 public List<StartTag> getAllStartTags(final String attributeName, final String value, final boolean valueCaseSensitive)
416 {
417 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
418 if (startTag == null) return Collections.emptyList();
419 final ArrayList<StartTag> list = new ArrayList<StartTag>();
420 do
421 {
422 list.add(startTag);
423 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
424 }
425 while (startTag != null);
426 return list;
427 }
428
429 /**
430 * Returns a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
431 * <p>
432 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
433 * <p>
434 * See the {@link Tag} class documentation for more details about the behaviour of this method.
435 *
436 * @param attributeName
437 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
438 * @param valueRegexPattern
439 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
440 * @return a list of all {@link StartTag} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
441 * @see #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)
442 */
443 public List<StartTag> getAllStartTags(final String attributeName, final Pattern valueRegexPattern)
444 {
445 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
446 if (startTag == null) return Collections.emptyList();
447 final ArrayList<StartTag> list = new ArrayList<StartTag>();
448 do
449 {
450 list.add(startTag);
451 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
452 }
453 while (startTag != null);
454 return list;
455 }
456
457 /**
458 * Returns a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
459 * <p>
460 * This matches start tags with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
461 * <p>
462 * See the {@link Tag} class documentation for more details about the behaviour of this method.
463 *
464 * @param className
465 * the class name (case sensitive) to search for, must not be <code>null</code>.
466 * @return a list of all {@link StartTag} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
467 */
468 public List<StartTag> getAllStartTagsByClass(final String className)
469 {
470 return getAllStartTags("class", getClassPattern(className));
471 }
472
473 /**
474 * Returns a list of the immediate children of this segment in the document element hierarchy.
475 * <p>
476 * The returned list may include an element that extends beyond the end of this segment, as long as it begins within this segment.
477 * <p>
478 * An element found at the start of this segment is included in the list. Note however that if this segment <i>is</i> an {@link Element}, the overriding {@link Element#getChildElements()} method is called instead, which only returns the children of the element.
479 * <p>
480 * Calling <code>getChildElements()</code> on an <code>Element</code> is much more efficient than calling it on a <code>Segment</code>.
481 * <p>
482 * The objects in the list are all of type {@link Element}.
483 * <p>
484 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
485 * <p>
486 * See the {@link Source#getChildElements()} method for more details.
487 *
488 * @return a list of the immediate children of this segment in the document element hierarchy, guaranteed not <code>null</code>.
489 * @see Element#getParentElement()
490 */
491 public List<Element> getChildElements()
492 {
493 if (length() == 0) return Collections.emptyList();
494 List<Element> childElements = new ArrayList<Element>();
495 int pos = begin;
496 while (true)
497 {
498 final StartTag childStartTag = source.getNextStartTag(pos);
499 if (childStartTag == null || childStartTag.begin >= end) break;
500 if (!Config.IncludeServerTagsInElementHierarchy && childStartTag.getTagType().isServerTag())
501 {
502 pos = childStartTag.end;
503 continue;
504 }
505 final Element childElement = childStartTag.getElement();
506 childElements.add(childElement);
507 childElement.getChildElements();
508 pos = childElement.end;
509 }
510 return childElements;
511 }
512
513 /**
514 * Returns a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
515 * <p>
516 * The {@link Source#fullSequentialParse()} method should be called after construction of the {@link Source} object if this method is to be used on a large proportion of the source. It is called automatically if this method is called on the {@link Source} object itself.
517 * <p>
518 * The elements returned correspond exactly with the start tags returned in the {@link #getAllStartTags()} method.
519 * <p>
520 * If this segment is itself an {@link Element}, the result includes this element in the list.
521 *
522 * @return a list of all {@link Element} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
523 */
524 public List<Element> getAllElements()
525 {
526 return getAllElements(getAllStartTags());
527 }
528
529 /**
530 * Returns a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
531 * <p>
532 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String name)} method, except that elements which are not entirely enclosed by this segment are excluded.
533 * <p>
534 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getAllElements()}, which may include elements of non-{@linkplain StartTagType#NORMAL normal} tags.
535 * <p>
536 * This method also returns elements consisting of {@linkplain Tag#isUnregistered() unregistered} tags if the specified name is not a valid {@linkplain Tag#isXMLName(CharSequence) XML tag name}.
537 * <p>
538 * If this segment is itself an {@link Element} with the specified name, the result includes this element in the list.
539 *
540 * @param name
541 * the {@linkplain Element#getName() name} of the elements to get.
542 * @return a list of all {@link Element} objects with the specified name that are {@linkplain #encloses(Segment) enclosed} by this segment.
543 */
544 public List<Element> getAllElements(String name)
545 {
546 return getAllElements(getAllStartTags(name));
547 }
548
549 /**
550 * Returns a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
551 * <p>
552 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(StartTagType)} method, except that elements which are not entirely enclosed by this segment are excluded.
553 * <p>
554 * If this segment is itself an {@link Element} with the specified type, the result includes this element in the list.
555 *
556 * @param startTagType
557 * the {@linkplain StartTagType type} of start tags to get, must not be <code>null</code>.
558 * @return a list of all {@link Element} objects with start tags of the specified {@linkplain StartTagType type} that are {@linkplain #encloses(Segment) enclosed} by this segment.
559 */
560 public List<Element> getAllElements(final StartTagType startTagType)
561 {
562 if (startTagType == null) throw new IllegalArgumentException("startTagType argument must not be null");
563 return getAllElements(getAllStartTags(startTagType));
564 }
565
566 /**
567 * Returns a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
568 * <p>
569 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, String value, boolean valueCaseSensitive)} method, except that elements which are not entirely enclosed by this segment are excluded.
570 * <p>
571 * If this segment is itself an {@link Element} with the specified name/value pair, the result includes this element in the list.
572 *
573 * @param attributeName
574 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
575 * @param value
576 * the value of the specified attribute to search for, must not be <code>null</code>.
577 * @param valueCaseSensitive
578 * specifies whether the attribute value matching is case sensitive.
579 * @return a list of all {@link Element} objects with the specified attribute name/value pair that are {@linkplain #encloses(Segment) enclosed} by this segment.
580 * @see #getAllElements(String attributeName, Pattern valueRegexPattern)
581 */
582 public List<Element> getAllElements(final String attributeName, final String value, final boolean valueCaseSensitive)
583 {
584 return getAllElements(getAllStartTags(attributeName, value, valueCaseSensitive));
585 }
586
587 /**
588 * Returns a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
589 * <p>
590 * The elements returned correspond with the start tags returned in the {@link #getAllStartTags(String attributeName, Pattern valueRegexPattern)} method, except that elements which are not entirely enclosed by this segment are excluded.
591 * <p>
592 * Specifying a <code>null</code> argument to the <code>valueRegexPattern</code> parameter performs the search on the attribute name only, without regard to the attribute value. This will also match an attribute that {@linkplain Attribute#hasValue() has no value} at all.
593 * <p>
594 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, the result includes this element in the list.
595 *
596 * @param attributeName
597 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
598 * @param valueRegexPattern
599 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
600 * @return a list of all {@link Element} objects with the specified attribute name and value pattern that are {@linkplain #encloses(Segment) enclosed} by this segment.
601 * @see #getAllElements(String attributeName, String value, boolean valueCaseSensitive)
602 */
603 public List<Element> getAllElements(final String attributeName, final Pattern valueRegexPattern)
604 {
605 return getAllElements(getAllStartTags(attributeName, valueRegexPattern));
606 }
607
608 /**
609 * Returns a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
610 * <p>
611 * This matches elements with a <code>class</code> attribute that contains the specified class name, either as an exact match or where the specified class name is one of multiple class names separated by white space in the attribute value.
612 * <p>
613 * The elements returned correspond with the start tags returned in the {@link #getAllStartTagsByClass(String className)} method, except that elements which are not entirely enclosed by this segment are excluded.
614 * <p>
615 * If this segment is itself an {@link Element} with the specified class, the result includes this element in the list.
616 *
617 * @param className
618 * the class name (case sensitive) to search for, must not be <code>null</code>.
619 * @return a list of all {@link Element} objects with the specified class that are {@linkplain #encloses(Segment) enclosed} by this segment.
620 */
621 public List<Element> getAllElementsByClass(final String className)
622 {
623 return getAllElements(getAllStartTagsByClass(className));
624 }
625
626 /**
627 * Returns a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
628 *
629 * @return a list of all {@link CharacterReference} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
630 */
631 public List<CharacterReference> getAllCharacterReferences()
632 {
633 CharacterReference characterReference = getNextCharacterReference(begin);
634 if (characterReference == null) return Collections.emptyList();
635 final ArrayList<CharacterReference> list = new ArrayList<CharacterReference>();
636 do
637 {
638 list.add(characterReference);
639 characterReference = getNextCharacterReference(characterReference.end);
640 }
641 while (characterReference != null);
642 return list;
643 }
644
645 /**
646 * Returns a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
647 * <p>
648 * According to the <a target="_blank" href="http://www.w3.org/TR/html401/">HTML 4.01 specification</a>, the following attributes have URI values:
649 * <table class="bordered" cellspacing="0">
650 * <tr>
651 * <th>HTML element name
652 * <th>Attribute name
653 * <tr>
654 * <td>{@link HTMLElementName#A A}
655 * <td>href
656 * <tr>
657 * <td>{@link HTMLElementName#APPLET APPLET}
658 * <td>codebase
659 * <tr>
660 * <td>{@link HTMLElementName#AREA AREA}
661 * <td>href
662 * <tr>
663 * <td>{@link HTMLElementName#BASE BASE}
664 * <td>href
665 * <tr>
666 * <td>{@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE}
667 * <td>cite
668 * <tr>
669 * <td>{@link HTMLElementName#BODY BODY}
670 * <td>background
671 * <tr>
672 * <td>{@link HTMLElementName#FORM FORM}
673 * <td>action
674 * <tr>
675 * <td>{@link HTMLElementName#FRAME FRAME}
676 * <td>longdesc
677 * <tr>
678 * <td>{@link HTMLElementName#FRAME FRAME}
679 * <td>src
680 * <tr>
681 * <td>{@link HTMLElementName#DEL DEL}
682 * <td>cite
683 * <tr>
684 * <td>{@link HTMLElementName#HEAD HEAD}
685 * <td>profile
686 * <tr>
687 * <td>{@link HTMLElementName#IFRAME IFRAME}
688 * <td>longdesc
689 * <tr>
690 * <td>{@link HTMLElementName#IFRAME IFRAME}
691 * <td>src
692 * <tr>
693 * <td>{@link HTMLElementName#IMG IMG}
694 * <td>longdesc
695 * <tr>
696 * <td>{@link HTMLElementName#IMG IMG}
697 * <td>src
698 * <tr>
699 * <td>{@link HTMLElementName#IMG IMG}
700 * <td>usemap
701 * <tr>
702 * <td>{@link HTMLElementName#INPUT INPUT}
703 * <td>src
704 * <tr>
705 * <td>{@link HTMLElementName#INPUT INPUT}
706 * <td>usemap
707 * <tr>
708 * <td>{@link HTMLElementName#INS INS}
709 * <td>cite
710 * <tr>
711 * <td>{@link HTMLElementName#LINK LINK}
712 * <td>href
713 * <tr>
714 * <td>{@link HTMLElementName#OBJECT OBJECT}
715 * <td>classid
716 * <tr>
717 * <td>{@link HTMLElementName#OBJECT OBJECT}
718 * <td>codebase
719 * <tr>
720 * <td>{@link HTMLElementName#OBJECT OBJECT}
721 * <td>data
722 * <tr>
723 * <td>{@link HTMLElementName#OBJECT OBJECT}
724 * <td>usemap
725 * <tr>
726 * <td>{@link HTMLElementName#Q Q}
727 * <td>cite
728 * <tr>
729 * <td>{@link HTMLElementName#SCRIPT SCRIPT}
730 * <td>src
731 * </table>
732 * <p>
733 * Attributes from other elements may also be returned if the attribute name matches one of those in the list above.
734 * <p>
735 * This method is often used in conjunction with the {@link #getStyleURISegments()} method in order to find all URIs in a document.
736 * <p>
737 * The attributes are returned in order of appearance.
738 *
739 * @return a list of all {@linkplain Attribute attributes} {@linkplain #encloses(Segment) enclosed} by this segment that have <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> values.
740 * @see #getStyleURISegments()
741 */
742 public List<Attribute> getURIAttributes()
743 {
744 return URIAttributes.getList(this);
745 }
746
747 /**
748 * Returns a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
749 * <p>
750 * If this segment does not contain any tags, the entire segment is assumed to be a <code>style</code> attribute value.
751 * <p>
752 * The URI segments are found by searching the <code>style</code> attribute values for the functional notation "<code>url()</code>" as described in <a target="_blank" href="http://www.w3.org/TR/CSS2/syndata.html#uri">section 4.3.4 of the CSS2 specification</a>.
753 * <p>
754 * The segments are returned in order of appearance.
755 *
756 * @return a list of all <a target="_blank" href="http://en.wikipedia.org/wiki/URI">URI</a> {@linkplain Segment segments} inside <code>style</code> attribute values {@linkplain #encloses(Segment) enclosed} by this segment.
757 * @see #getURIAttributes()
758 */
759 public List<Segment> getStyleURISegments()
760 {
761 return URIAttributes.getStyleURISegments(this);
762 }
763
764 /**
765 * Returns the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
766 * <p>
767 * This is functionally equivalent to {@link #getAllStartTags()}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
768 *
769 * @return the first {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
770 */
771 public final StartTag getFirstStartTag()
772 {
773 return checkEnclosure(source.getNextStartTag(begin));
774 }
775
776 /**
777 * Returns the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment.
778 * <p>
779 * This is functionally equivalent to {@link #getAllStartTags(StartTagType) getAllStartTags(startTagType)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
780 *
781 * @param startTagType
782 * the <code>StartTagType</code> to search for.
783 * @return the first {@link StartTag} of the specified {@linkplain StartTagType type} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
784 */
785 public final StartTag getFirstStartTag(StartTagType startTagType)
786 {
787 return checkEnclosure(source.getNextStartTag(begin, startTagType));
788 }
789
790 /**
791 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment.
792 * <p>
793 * This is functionally equivalent to {@link #getAllStartTags(String) getAllStartTags(name)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
794 * <p>
795 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstStartTag()}.
796 *
797 * @param name
798 * the {@linkplain StartTag#getName() name} of the start tag to search for, may be <code>null</code>.
799 * @return the first {@linkplain StartTagType#NORMAL normal} {@link StartTag} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
800 */
801 public final StartTag getFirstStartTag(String name)
802 {
803 return checkEnclosure(source.getNextStartTag(begin, name));
804 }
805
806 /**
807 * Returns the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
808 * <p>
809 * This is functionally equivalent to {@link #getAllStartTags(String,String,boolean) getAllStartTags(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
810 *
811 * @param attributeName
812 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
813 * @param value
814 * the value of the specified attribute to search for, must not be <code>null</code>.
815 * @param valueCaseSensitive
816 * specifies whether the attribute value matching is case sensitive.
817 * @return the first {@link StartTag} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
818 * @see #getFirstStartTag(String attributeName, Pattern valueRegexPattern)
819 */
820 public final StartTag getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
821 {
822 return checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
823 }
824
825 /**
826 * Returns the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
827 * <p>
828 * This is functionally equivalent to {@link #getAllStartTags(String,Pattern) getAllStartTags(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
829 *
830 * @param attributeName
831 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
832 * @param valueRegexPattern
833 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
834 * @return the first {@link StartTag} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
835 * @see #getFirstStartTag(String attributeName, String value, boolean valueCaseSensitive)
836 */
837 public final StartTag getFirstStartTag(final String attributeName, final Pattern valueRegexPattern)
838 {
839 return checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
840 }
841
842 /**
843 * Returns the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
844 * <p>
845 * This is functionally equivalent to {@link #getAllStartTagsByClass(String) getAllStartTagsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first start tag and returns <code>null</code> if no such start tag exists.
846 *
847 * @param className
848 * the class name (case sensitive) to search for, must not be <code>null</code>.
849 * @return the first {@link StartTag} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
850 */
851 public final StartTag getFirstStartTagByClass(final String className)
852 {
853 return checkEnclosure(source.getNextStartTagByClass(begin, className));
854 }
855
856 /**
857 * Returns the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment.
858 * <p>
859 * This is functionally equivalent to {@link #getAllElements()}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
860 * <p>
861 * If this segment is itself an {@link Element}, this element is returned, not the first child element.
862 *
863 * @return the first {@link Element} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
864 */
865 public final Element getFirstElement()
866 {
867 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin));
868 while (startTag != null)
869 {
870 final Element element = startTag.getElement();
871 if (element.end <= end) return element;
872 startTag = checkEnclosure(startTag.getNextStartTag());
873 }
874 return null;
875 }
876
877 /**
878 * Returns the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment.
879 * <p>
880 * This is functionally equivalent to {@link #getAllElements(String) getAllElements(name)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
881 * <p>
882 * Specifying a <code>null</code> argument to the <code>name</code> parameter is equivalent to {@link #getFirstElement()}.
883 * <p>
884 * If this segment is itself an {@link Element} with the specified name, this element is returned.
885 *
886 * @param name
887 * the {@linkplain Element#getName() name} of the element to search for.
888 * @return the first {@linkplain StartTagType#NORMAL normal} {@link Element} with the specified {@linkplain Element#getName() name} {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
889 */
890 public final Element getFirstElement(String name)
891 {
892 if (name == null) return getFirstElement();
893 final boolean isXMLTagName = Tag.isXMLName(name);
894 StartTag startTag = checkEnclosure(StartTag.getNext(source, begin, name, StartTagType.NORMAL, isXMLTagName));
895 while (startTag != null)
896 {
897 final Element element = startTag.getElement();
898 if (element.end <= end) return element;
899 startTag = checkEnclosure(StartTag.getNext(source, startTag.begin + 1, name, StartTagType.NORMAL, isXMLTagName));
900 }
901 return null;
902 }
903
904 /**
905 * Returns the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment.
906 * <p>
907 * This is functionally equivalent to {@link #getAllElements(String,String,boolean) getAllElements(attributeName,value,valueCaseSensitive)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
908 * <p>
909 * If this segment is itself an {@link Element} with the specified attribute name/value pair, this element is returned.
910 *
911 * @param attributeName
912 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
913 * @param value
914 * the value of the specified attribute to search for, must not be <code>null</code>.
915 * @param valueCaseSensitive
916 * specifies whether the attribute value matching is case sensitive.
917 * @return the first {@link Element} with the specified attribute name/value pair {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
918 * @see #getFirstElement(String attributeName, Pattern valueRegexPattern)
919 */
920 public final Element getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
921 {
922 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, value, valueCaseSensitive));
923 while (startTag != null)
924 {
925 final Element element = startTag.getElement();
926 if (element.end <= end) return element;
927 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, value, valueCaseSensitive));
928 }
929 return null;
930 }
931
932 /**
933 * Returns the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment.
934 * <p>
935 * This is functionally equivalent to {@link #getAllElements(String,Pattern) getAllElements(attributeName,valueRegexPattern)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
936 * <p>
937 * If this segment is itself an {@link Element} with the specified attribute name and value pattern, this element is returned.
938 *
939 * @param attributeName
940 * the attribute name (case insensitive) to search for, must not be <code>null</code>.
941 * @param valueRegexPattern
942 * the regular expression pattern that must match the attribute value, may be <code>null</code>.
943 * @return the first {@link Element} with the specified attribute name and value pattern that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
944 * @see #getFirstElement(String attributeName, String value, boolean valueCaseSensitive)
945 */
946 public final Element getFirstElement(final String attributeName, final Pattern valueRegexPattern)
947 {
948 StartTag startTag = checkEnclosure(source.getNextStartTag(begin, attributeName, valueRegexPattern));
949 while (startTag != null)
950 {
951 final Element element = startTag.getElement();
952 if (element.end <= end) return element;
953 startTag = checkEnclosure(source.getNextStartTag(startTag.begin + 1, attributeName, valueRegexPattern));
954 }
955 return null;
956 }
957
958 /**
959 * Returns the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment.
960 * <p>
961 * This is functionally equivalent to {@link #getAllElementsByClass(String) getAllElementsByClass(className)}<code>.iterator().next()</code>, but does not search beyond the first enclosed element and returns <code>null</code> if no such element exists.
962 * <p>
963 * If this segment is itself an {@link Element} with the specified class, this element is returned.
964 *
965 * @param className
966 * the class name (case sensitive) to search for, must not be <code>null</code>.
967 * @return the first {@link Element} with the specified class that is {@linkplain #encloses(Segment) enclosed} by this segment, or <code>null</code> if none exists.
968 */
969 public final Element getFirstElementByClass(final String className)
970 {
971 StartTag startTag = checkEnclosure(source.getNextStartTagByClass(begin, className));
972 while (startTag != null)
973 {
974 final Element element = startTag.getElement();
975 if (element.end <= end) return element;
976 startTag = checkEnclosure(source.getNextStartTagByClass(startTag.begin + 1, className));
977 }
978 return null;
979 }
980
981 /**
982 * Returns a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
983 *
984 * @return a list of the {@link FormControl} objects that are {@linkplain #encloses(Segment) enclosed} by this segment.
985 */
986 public List<FormControl> getFormControls()
987 {
988 return FormControl.getAll(this);
989 }
990
991 /**
992 * Returns the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
993 * <p>
994 * This is equivalent to {@link FormFields#FormFields(Collection) new FormFields}<code>(</code>{@link #getFormControls()}<code>)</code>.
995 *
996 * @return the {@link FormFields} object representing all form fields that are {@linkplain #encloses(Segment) enclosed} by this segment.
997 * @see #getFormControls()
998 */
999 public FormFields getFormFields()
1000 {
1001 return new FormFields(getFormControls());
1002 }
1003
1004 /**
1005 * Parses any {@link Attributes} within this segment.
1006 * This method is only used in the unusual situation where attributes exist outside of a start tag.
1007 * The {@link StartTag#getAttributes()} method should be used in normal situations.
1008 * <p>
1009 * This is equivalent to <code>source.</code>{@link Source#parseAttributes(int,int) parseAttributes}<code>(</code>{@link #getBegin()}<code>,</code>{@link #getEnd()}<code>)</code>.
1010 *
1011 * @return the {@link Attributes} within this segment, or <code>null</code> if too many errors occur while parsing.
1012 */
1013 public Attributes parseAttributes()
1014 {
1015 return source.parseAttributes(begin, end);
1016 }
1017
1018 /**
1019 * Causes this segment to be ignored when parsing.
1020 * <p>
1021 * Ignored segments are treated as blank spaces by the parsing mechanism, but are included as normal text in all other functions.
1022 * <p>
1023 * This method was originally the only means of preventing {@linkplain TagType#isServerTag() server tags} located inside {@linkplain StartTagType#NORMAL normal} tags from interfering with the parsing of the tags (such as where an {@linkplain Attribute attribute} of a normal tag uses a server tag to dynamically set its value), as well as preventing non-server tags from being recognised inside server tags.
1024 * <p>
1025 * It is not necessary to use this method to ignore {@linkplain TagType#isServerTag() server tags} located inside normal tags, as the attributes parser automatically ignores any server tags.
1026 * <p>
1027 * It is not necessary to use this method to ignore non-server tags inside server tags, or the contents of {@link HTMLElementName#SCRIPT SCRIPT} elements, as the parser does this automatically when performing a {@linkplain Source#fullSequentialParse() full sequential parse}.
1028 * <p>
1029 * This leaves only very few scenarios where calling this method still provides a significant benefit.
1030 * <p>
1031 * One such case is where XML-style server tags are used inside {@linkplain StartTagType#NORMAL normal} tags. Here is an example using an XML-style JSP tag: <blockquote class="code"><code>&lt;a href="&lt;i18n:resource path="/Portal"/&gt;?BACK=TRUE"&gt;back&lt;/a&gt;</code></blockquote> The first double-quote of <code>"/Portal"</code> will be interpreted as the end quote for the <code>href</code> attribute, as there is no way for the parser to recognise the <code>i18n:resource</code> element
1032 * as a server tag. Such use of XML-style server tags inside {@linkplain StartTagType#NORMAL normal} tags is generally seen as bad practice, but it is nevertheless valid JSP. The only way to ensure that this library is able to parse the normal tag surrounding it is to find these server tags first and call the <code>ignoreWhenParsing</code> method to ignore them before parsing the rest of the document.
1033 * <p>
1034 * It is important to understand the difference between ignoring the segment when parsing and removing the segment completely. Any text inside a segment that is ignored when parsing is treated by most functions as content, and as such is included in the output of tools such as {@link TextExtractor} and {@link Renderer}.
1035 * <p>
1036 * To remove segments completely, create an {@link OutputDocument} and call its {@link OutputDocument#remove(Segment) remove(Segment)} or {@link OutputDocument#replaceWithSpaces(int,int) replaceWithSpaces(int begin, int end)} method for each segment. Then create a new source document using {@link Source#Source(CharSequence) new Source(outputDocument.toString())} and perform the desired operations on this new source object.
1037 * <p>
1038 * Calling this method after the {@link Source#fullSequentialParse()} method has been called is not permitted and throws an <code>IllegalStateException</code>.
1039 * <p>
1040 * Any tags appearing in this segment that are found before this method is called will remain in the {@linkplain Source#getCacheDebugInfo() tag cache}, and so will continue to be found by the <a href="Tag.html#TagSearchMethods">tag search methods</a>. If this is undesirable, the {@link Source#clearCache()} method can be called to remove them from the cache. Calling the {@link Source#fullSequentialParse()} method after this method clears the cache automatically.
1041 * <p>
1042 * For best performance, this method should be called on all segments that need to be ignored without calling any of the <a href="Tag.html#TagSearchMethods">tag search methods</a> in between.
1043 *
1044 * @see Source#ignoreWhenParsing(Collection segments)
1045 */
1046 public void ignoreWhenParsing()
1047 {
1048 source.ignoreWhenParsing(begin, end);
1049 }
1050
1051 /**
1052 * Compares this <code>Segment</code> object to another object.
1053 * <p>
1054 * If the argument is not a <code>Segment</code>, a <code>ClassCastException</code> is thrown.
1055 * <p>
1056 * A segment is considered to be before another segment if its begin position is earlier, or in the case that both segments begin at the same position, its end position is earlier.
1057 * <p>
1058 * Segments that begin and end at the same position are considered equal for the purposes of this comparison, even if they relate to different source documents.
1059 * <p>
1060 * Note: this class has a natural ordering that is inconsistent with equals. This means that this method may return zero in some cases where calling the {@link #equals(Object)} method with the same argument returns <code>false</code>.
1061 *
1062 * @param segment
1063 * the segment to be compared
1064 * @return a negative integer, zero, or a positive integer as this segment is before, equal to, or after the specified segment.
1065 * @throws ClassCastException
1066 * if the argument is not a <code>Segment</code>
1067 */
1068 public int compareTo(final Segment segment)
1069 {
1070 if (this == segment) return 0;
1071 if (begin < segment.begin) return -1;
1072 if (begin > segment.begin) return 1;
1073 if (end < segment.end) return -1;
1074 if (end > segment.end) return 1;
1075 return 0;
1076 }
1077
1078 /**
1079 * Indicates whether this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}.
1080 *
1081 * @return <code>true</code> if this segment consists entirely of {@linkplain #isWhiteSpace(char) white space}, otherwise <code>false</code>.
1082 */
1083 public final boolean isWhiteSpace()
1084 {
1085 for (int i = begin; i < end; i++)
1086 if (!isWhiteSpace(source.charAt(i))) return false;
1087 return true;
1088 }
1089
1090 /**
1091 * Returns an indication of the maximum depth of nested elements within this segment.
1092 * <p>
1093 * A high return value can indicate that the segment contains a large number of incorrectly nested tags that could result in a <code>StackOverflowError</code> if its content is parsed.
1094 * <p>
1095 * The usefulness of this method is debatable as a <code>StackOverflowError</code> is a recoverable error that can be easily caught. The use of this method to pre-detect and avoid a stack overflow may save some memory and processing resources in certain circumstances, but the cost of calling this method to check every segment or document will very often exceed any benefit.
1096 * <p>
1097 * It is up to the application developer to determine what return value constitutes an unreasonable level of nesting given the stack space allocated to the application and other factors.
1098 * <p>
1099 * Note that the return value is an approximation only and is usually greater than the actual maximum element depth that would be reported by calling the {@link Element#getDepth()} method on the most nested element.
1100 *
1101 * @return an indication of the maximum depth of nested elements within this segment.
1102 */
1103 public int getMaxDepthIndicator()
1104 {
1105 int maxDepth = 0;
1106 int depth = 0;
1107 for (Tag tag : getAllTags())
1108 {
1109 if (tag instanceof StartTag)
1110 {
1111 StartTag startTag = (StartTag) tag;
1112 if (startTag.getStartTagType().getCorrespondingEndTagType() == null) continue;
1113 if (HTMLElements.getEndTagForbiddenElementNames().contains(startTag.getName())) continue;
1114 if (startTag.isEmptyElementTag()) continue;
1115 depth++;
1116 if (depth > maxDepth) maxDepth++;
1117 }
1118 else
1119 {
1120 depth--;
1121 }
1122 }
1123 return maxDepth;
1124 }
1125
1126 /**
1127 * Indicates whether the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>.
1128 * <p>
1129 * The <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">HTML 4.01 specification section 9.1</a> specifies the following white space characters:
1130 * <ul>
1131 * <li>space (U+0020)
1132 * <li>tab (U+0009)
1133 * <li>form feed (U+000C)
1134 * <li>line feed (U+000A)
1135 * <li>carriage return (U+000D)
1136 * <li>zero-width space (U+200B)
1137 * </ul>
1138 * <p>
1139 * Despite the explicit inclusion of the zero-width space in the HTML specification, Microsoft IE6 does not recognise them as white space and renders them as an unprintable character (empty square). Even zero-width spaces included using the numeric character reference <code>&amp;#x200B;</code> are rendered this way.
1140 *
1141 * @param ch
1142 * the character to test.
1143 * @return <code>true</code> if the specified character is <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">white space</a>, otherwise <code>false</code>.
1144 */
1145 public static final boolean isWhiteSpace(final char ch)
1146 {
1147 for (char whiteSpaceChar : WHITESPACE)
1148 if (ch == whiteSpaceChar) return true;
1149 return false;
1150 }
1151
1152 /**
1153 * Returns a string representation of this object useful for debugging purposes.
1154 *
1155 * @return a string representation of this object useful for debugging purposes.
1156 */
1157 public String getDebugInfo()
1158 {
1159 final StringBuilder sb = new StringBuilder(50);
1160 sb.append('(');
1161 source.getRowColumnVector(begin).appendTo(sb);
1162 sb.append('-');
1163 source.getRowColumnVector(end).appendTo(sb);
1164 sb.append(')');
1165 return sb.toString();
1166 }
1167
1168 /**
1169 * Returns the character at the specified index.
1170 * <p>
1171 * This is logically equivalent to <code>toString().charAt(index)</code> for valid argument values <code>0 &lt;= index &lt; length()</code>.
1172 * <p>
1173 * However because this implementation works directly on the underlying document source string, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for an invalid argument value.
1174 *
1175 * @param index
1176 * the index of the character.
1177 * @return the character at the specified index.
1178 */
1179 public char charAt(final int index)
1180 {
1181 return source.charAt(begin + index);
1182 }
1183
1184 /**
1185 * Returns a new character sequence that is a subsequence of this sequence.
1186 * <p>
1187 * This is logically equivalent to <code>toString().subSequence(beginIndex,endIndex)</code> for valid values of <code>beginIndex</code> and <code>endIndex</code>.
1188 * <p>
1189 * However because this implementation works directly on the underlying document source text, it should not be assumed that an <code>IndexOutOfBoundsException</code> is thrown for invalid argument values as described in the <code>String.subSequence(int,int)</code> method.
1190 *
1191 * @param beginIndex
1192 * the begin index, inclusive.
1193 * @param endIndex
1194 * the end index, exclusive.
1195 * @return a new character sequence that is a subsequence of this sequence.
1196 */
1197 public CharSequence subSequence(final int beginIndex, final int endIndex)
1198 {
1199 return source.subSequence(begin + beginIndex, begin + endIndex);
1200 }
1201
1202 /**
1203 * Collapses the {@linkplain #isWhiteSpace(char) white space} in the specified text.
1204 * All leading and trailing white space is omitted, and any sections of internal white space are replaced by a single space.
1205 */
1206 static final StringBuilder appendCollapseWhiteSpace(final StringBuilder sb, final CharSequence text)
1207 {
1208 final int textLength = text.length();
1209 int i = 0;
1210 boolean lastWasWhiteSpace = false;
1211 while (true)
1212 {
1213 if (i >= textLength) return sb;
1214 if (!isWhiteSpace(text.charAt(i))) break;
1215 i++;
1216 }
1217 do
1218 {
1219 final char ch = text.charAt(i++);
1220 if (isWhiteSpace(ch))
1221 {
1222 lastWasWhiteSpace = true;
1223 }
1224 else
1225 {
1226 if (lastWasWhiteSpace)
1227 {
1228 sb.append(' ');
1229 lastWasWhiteSpace = false;
1230 }
1231 sb.append(ch);
1232 }
1233 }
1234 while (i < textLength);
1235 return sb;
1236 }
1237
1238 static final Pattern getClassPattern(final String className)
1239 {
1240 return Pattern.compile(".*(\\s|^)" + className + "(\\s|$).*", Pattern.DOTALL);
1241 }
1242
1243 private List<Element> getAllElements(final List<StartTag> startTags)
1244 {
1245 if (startTags.isEmpty()) return Collections.emptyList();
1246 final ArrayList<Element> elements = new ArrayList<Element>(startTags.size());
1247 for (StartTag startTag : startTags)
1248 {
1249 final Element element = startTag.getElement();
1250 if (element.end <= end) elements.add(element);
1251 }
1252 return elements;
1253 }
1254
1255 private StartTag checkEnclosure(final StartTag startTag)
1256 {
1257 if (startTag == null || startTag.end > end) return null;
1258 return startTag;
1259 }
1260
1261 private Tag checkTagEnclosure(final Tag tag)
1262 {
1263 if (tag == null || tag.end > end) return null;
1264 return tag;
1265 }
1266
1267 private CharacterReference getNextCharacterReference(final int pos)
1268 {
1269 final CharacterReference characterReference = source.getNextCharacterReference(pos);
1270 if (characterReference == null || characterReference.end > end) return null;
1271 return characterReference;
1272 }
1273 }

   
Visit the aagtl Website