/[aagtl_public1]/src/net/htmlparser/jericho/StreamedSource.java
aagtl

Contents of /src/net/htmlparser/jericho/StreamedSource.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 39689 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.Iterator;
24 import java.util.NoSuchElementException;
25 import java.io.Closeable;
26 import java.io.Reader;
27 import java.io.Writer;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.io.InputStreamReader;
31 import java.nio.CharBuffer;
32 import java.nio.BufferOverflowException;
33 import java.net.URL;
34 import java.net.URLConnection;
35
36 /**
37 * Represents a streamed source HTML document.
38 * <p>
39 * This class provides a means, via the {@link #iterator()} method, of sequentially parsing every {@linkplain Tag tag}, {@linkplain CharacterReference character reference}
40 * and <a href="#PlainText">plain text</a> segment contained within the source document using a minimum amount of memory.
41 * <p>
42 * In contrast, the standard {@link Source} class stores the entire source text in memory and caches every tag parsed,
43 * resulting in memory problems when attempting to parse very large files.
44 * <p>
45 * The {@link #iterator() iterator} parses and returns each segment as the source text is streamed in.
46 * Previous segments are discarded for garbage collection.
47 * Source documents up to 2GB in size can be processed, a limit which is imposed by the java language because of its use of the <code>int</code> data type to index string operations.
48 * <p>
49 * There is however a significant trade-off in functionality when using the <code>StreamedSource</code> class as opposed to the {@link Source} class.
50 * The {@link Tag#getElement()} method is not supported on tags that are returned by the iterator, nor are any methods that use the {@link Element} class in any way.
51 * The {@link Segment#getSource()} method is also not supported.
52 * <p>
53 * Most of the methods and constructors in this class mirror similarly named methods in the {@link Source} class where the same functionality is available.
54 * <p>
55 * See the description of the {@link #iterator()} method for a typical usage example of this class.
56 * <p>
57 * In contrast to a {@link Source} object, the <code>Reader</code> or <code>InputStream</code> specified in the constructor or created implicitly by the constructor
58 * remains open for the life of the <code>StreamedSource</code> object. If the stream is created internally, it is automatically {@linkplain #close() closed}
59 * when the end of the stream is reached or the <code>StreamedSource</code> object is {@linkplain #finalize() finalized}.
60 * However a <code>Reader</code> or <code>InputStream</code> that is specified directly in a constructor is never closed automatically, as it can not be assumed
61 * that the application has no further use for it. It is the user's responsibility to ensure it is closed in this case.
62 * Explicitly calling the {@link #close()} method on the <code>StreamedSource</code> object ensures that all resources used by it are closed, regardless of whether
63 * they were created internally or supplied externally.
64 * <p>
65 * The functionality provided by <code>StreamedSource</code> is similar to a <a target="_blank" href="http://en.wikipedia.org/wiki/StAX">StAX</a> parser,
66 * but with some important benefits:
67 * <ul>
68 * <li>
69 * The source document does not have to be valid XML. It can be plain HTML, can contain invalid syntax, undefined entities,
70 * incorrectly nested elements, {@linkplain TagType#isServerTag() server tags}, or anything else that is commonly found in
71 * "<a target="_blank" href="http://en.wikipedia.org/wiki/Tag_soup">tag soup</a>".
72 * <li>
73 * Every single syntactical construct in the source document's original text is included in the iterator, including the
74 * {@linkplain StartTagType#XML_DECLARATION XML declaration}, {@linkplain CharacterReference character references}, {@linkplain StartTagType#COMMENT comments},
75 * {@linkplain StartTagType#CDATA_SECTION CDATA sections} and {@linkplain TagType#isServerTag() server tags},
76 * each providing the segment's {@linkplain Segment#getBegin() begin} and {@linkplain Segment#getEnd() end} position in the source document.
77 * This allows an exact copy of the original document to be generated, allowing modifications to be made only where they are explicitly required.
78 * This is not possible with either <a target="_blank" href="http://en.wikipedia.org/wiki/Simple_API_for_XML">SAX</a> or
79 * <a target="_blank" href="http://en.wikipedia.org/wiki/StAX">StAX</a>, which to some extent provide interpretations of the content of the XML
80 * instead of the syntactical structures used in the original source document.
81 * </ul>
82 * <p>
83 * The following table summarises the differences between the <code>StreamedSource</code>, StAX and SAX interfaces.
84 * Note that some of the available features are documented as optional and may not be supported by all implementations of StAX and SAX.
85 * <p>
86 * <style type="text/css">
87 * table#ParserComparison td, table#ParserComparison th {padding: 0px 5px 0px 5px}
88 * .checkmark {text-align: center; font-size: 12pt}
89 * </style>
90 * <table id="ParserComparison" class="bordered" cellspacing="0">
91 * <tr><th class="LabelColumn">Feature</th><th><code>StreamedSource</code></th><th><a target="_blank" href="http://en.wikipedia.org/wiki/StAX">StAX</a></th><th><a target="_blank" href="http://en.wikipedia.org/wiki/Simple_API_for_XML">SAX</a></th></tr>
92 * <tr><td class="LabelColumn">Parse XML</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td></tr>
93 * <tr><td class="LabelColumn">Parse entities without DTD</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
94 * <tr><td class="LabelColumn">Automatically validate XML</td><td class="checkmark"></td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td></tr>
95 * <tr><td class="LabelColumn">Parse HTML</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
96 * <tr><td class="LabelColumn">Tolerant of syntax or nesting errors</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
97 * <tr><td class="LabelColumn">Provide begin and end character positions of each event<sup>1</sup></td><td class="checkmark">&#9679;</td><td class="checkmark">&#9675;</td><td class="checkmark"></td></tr>
98 * <tr><td class="LabelColumn">Provide source text of each event</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
99 * <tr><td class="LabelColumn">Handle {@linkplain TagType#isServerTag() server tag} events</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
100 * <tr><td class="LabelColumn">Handle {@linkplain StartTagType#XML_DECLARATION XML declaration} event</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
101 * <tr><td class="LabelColumn">Handle {@linkplain StartTagType#COMMENT comment} events</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td></tr>
102 * <tr><td class="LabelColumn">Handle {@linkplain StartTagType#CDATA_SECTION CDATA section} events</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td></tr>
103 * <tr><td class="LabelColumn">Handle {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} event</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td></tr>
104 * <tr><td class="LabelColumn">Handle {@linkplain CharacterReference character reference} events</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
105 * <tr><td class="LabelColumn">Allow chunking of plain text</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td><td class="checkmark">&#9679;</td></tr>
106 * <tr><td class="LabelColumn">Allow chunking of comment text</td><td class="checkmark"></td><td class="checkmark"></td><td class="checkmark"></td></tr>
107 * <tr><td class="LabelColumn">Allow chunking of CDATA section text</td><td class="checkmark"></td><td class="checkmark"></td><td class="checkmark">&#9679;</td></tr>
108 * <tr><td class="LabelColumn">Allow specification of maximum buffer size</td><td class="checkmark">&#9679;</td><td class="checkmark"></td><td class="checkmark"></td></tr>
109 * </table>
110 * <sup>1</sup> StAX optionally reports the "offset" of each event but this could be either byte or character position depending on the source.
111 * <p>
112 * Note that the {@link OutputDocument} class can not be used to create a modified version of a streamed source document.
113 * Instead, the output document must be constructed manually from the segments provided by the {@link #iterator() iterator}.
114 * <p>
115 * <code>StreamedSource</code> objects are not thread safe.
116 */
117 public final class StreamedSource implements Iterable<Segment>, Closeable {
118 private final StreamedText streamedText;
119 private final StreamedParseText streamedParseText;
120 private final Source source;
121 private final Closeable closeable; // internally created closeable object should be cleaned up internally.
122 private final boolean automaticClose;
123 private boolean coalescing=false;
124 private boolean handleTags=true;
125 private Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings=Config.CurrentCompatibilityMode.getUnterminatedCharacterReferenceSettings(false);
126 private boolean isInitialised=false;
127 private Segment currentSegment=null;
128 private Segment nextParsedSegment=START_SEGMENT;
129 private boolean isXML;
130
131 private static final boolean assumeNoNestedTags=false;
132 private static final Segment START_SEGMENT=new Segment(-1,-1);
133
private StreamedSource(final Reader reader, final boolean automaticClose, final String encoding, final String encodingSpecificationInfo, final String preliminaryEncodingInfo) throws IOException {
	// Build the text -> parse text -> source chain on top of the reader.
	this.streamedText=new StreamedText(reader);
	this.streamedParseText=new StreamedParseText(this.streamedText);
	this.source=new Source(this.streamedText,this.streamedParseText,encoding,encodingSpecificationInfo,preliminaryEncodingInfo);
	// The reader is the resource that close() releases.
	this.closeable=reader;
	this.automaticClose=automaticClose;
}
141
private StreamedSource(final EncodingDetector encodingDetector, final boolean automaticClose) throws IOException {
	this(
		encodingDetector.openReader(),
		automaticClose,
		encodingDetector.getEncoding(),
		encodingDetector.getEncodingSpecificationInfo(),
		// preliminary encoding info is rendered as "<encoding>: <specification info>"
		encodingDetector.getPreliminaryEncoding()+": "+encodingDetector.getPreliminaryEncodingSpecificationInfo()
	);
}
145
146 /**
147 * Constructs a new <code>StreamedSource</code> object by loading the content from the specified <code>Reader</code>.
148 * <p>
149 * If the specified reader is an instance of <code>InputStreamReader</code>, the {@link #getEncoding()} method of the
150 * created <code>StreamedSource</code> object returns the encoding from <code>InputStreamReader.getEncoding()</code>.
151 *
152 * @param reader the <code>java.io.Reader</code> from which to load the source text.
153 * @throws java.io.IOException if an I/O error occurs.
154 */
public StreamedSource(final Reader reader) throws IOException {
	this(
		reader,
		false, // automaticClose
		// encoding: only an InputStreamReader can report one
		(reader instanceof InputStreamReader) ? ((InputStreamReader)reader).getEncoding() : null,
		// encodingSpecificationInfo: describes where the encoding above came from
		(reader instanceof InputStreamReader) ? "InputStreamReader.getEncoding() of constructor argument" : null,
		null // preliminaryEncodingInfo
	);
}
158
159 /**
160 * Constructs a new <code>StreamedSource</code> object by loading the content from the specified <code>InputStream</code>.
161 * <p>
162 * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document from the raw bytes
163 * of the specified input stream is the same as that for the {@link Source#Source(URLConnection) Source(URLConnection)} constructor of the {@link Source} class,
164 * except that the first step is not possible as there is no
165 * <a target="_blank" href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17">Content-Type</a> header to check.
166 * <p>
167 * If the specified <code>InputStream</code> does not support the <code>mark</code> method, the algorithm that determines the encoding may have to wrap it
168 * in a <code>BufferedInputStream</code> in order to look ahead at the encoding meta data.
169 * This extra layer of buffering will then remain in place for the life of the <code>StreamedSource</code>, possibly impacting memory usage and/or degrading performance.
170 * It is always preferable to use the {@link #StreamedSource(Reader)} constructor if the encoding is known in advance.
171 *
172 * @param inputStream the <code>java.io.InputStream</code> from which to load the source text.
173 * @throws java.io.IOException if an I/O error occurs.
174 * @see #getEncoding()
175 */
public StreamedSource(final InputStream inputStream) throws IOException {
	this(
		new EncodingDetector(inputStream), // detects the encoding from the raw bytes
		false // automaticClose
	);
}
179
180 /**
181 * Constructs a new <code>StreamedSource</code> object by loading the content from the specified URL.
182 * <p>
183 * This is equivalent to {@link #StreamedSource(URLConnection) StreamedSource(url.openConnection())}.
184 *
185 * @param url the URL from which to load the source text.
186 * @throws java.io.IOException if an I/O error occurs.
187 * @see #getEncoding()
188 */
public StreamedSource(final URL url) throws IOException {
	this(
		new EncodingDetector(url.openConnection()), // the connection is opened here, internally
		true // automaticClose
	);
}
192
193 /**
194 * Constructs a new <code>StreamedSource</code> object by loading the content from the specified <code>URLConnection</code>.
195 * <p>
196 * The algorithm for detecting the character {@linkplain #getEncoding() encoding} of the source document is identical to that described in the
197 * {@link Source#Source(URLConnection) Source(URLConnection)} constructor of the {@link Source} class.
198 * <p>
199 * The algorithm that determines the encoding may have to wrap the input stream in a <code>BufferedInputStream</code> in order to look ahead
200 * at the encoding meta data if the encoding is not specified in the HTTP headers.
201 * This extra layer of buffering will then remain in place for the life of the <code>StreamedSource</code>, possibly impacting memory usage and/or degrading performance.
202 * It is always preferable to use the {@link #StreamedSource(Reader)} constructor if the encoding is known in advance.
203 *
204 * @param urlConnection the URL connection from which to load the source text.
205 * @throws java.io.IOException if an I/O error occurs.
206 * @see #getEncoding()
207 */
public StreamedSource(final URLConnection urlConnection) throws IOException {
	this(
		new EncodingDetector(urlConnection),
		true // automaticClose
	);
}
211
212 /**
213 * Constructs a new <code>StreamedSource</code> object from the specified text.
214 * <p>
215 * Although the <code>CharSequence</code> argument of this constructor apparently contradicts the notion of streaming in the source text,
216 * it can still provide benefits over the equivalent use of the standard {@link Source} class.
217 * <p>
218 * Firstly, using the <code>StreamedSource</code> class to iterate the nodes of an in-memory <code>CharSequence</code> source document still requires much less memory
219 * than the equivalent operation using the standard {@link Source} class.
220 * <p>
221 * Secondly, the specified <code>CharSequence</code> object could possibly implement its own paging mechanism to minimise memory usage.
222 * <p>
223 * If the specified <code>CharSequence</code> is mutable, its state must not be modified while the <code>StreamedSource</code> is in use.
224 *
225 * @param text the source text.
226 */
public StreamedSource(final CharSequence text) {
	// Wrap the in-memory text with the same helper chain used for streams.
	this.streamedText=new StreamedText(text);
	this.streamedParseText=new StreamedParseText(this.streamedText);
	this.source=new Source(text,this.streamedParseText,null,"Document specified encoding can not be determined automatically from a streamed source",null);
	// No underlying stream exists, so there is nothing for close() to release.
	this.closeable=null;
	this.automaticClose=false;
}
234
235 /**
236 * Specifies an existing character array to use for buffering the incoming character stream.
237 * <p>
238 * The specified buffer is fixed for the life of the <code>StreamedSource</code> object, in contrast to the default buffer which can be automatically replaced
239 * by a larger buffer as needed.
240 * This means that if a {@linkplain Tag tag} (including a {@linkplain StartTagType#COMMENT comment} or {@linkplain StartTagType#CDATA_SECTION CDATA section}) is
241 * encountered that is larger than the specified buffer, an unrecoverable <code>BufferOverflowException</code> is thrown.
242 * This exception is also thrown if {@link #setCoalescing(boolean) coalescing} has been enabled and a plain text segment is encountered
243 * that is larger than the specified buffer.
244 * <p>
245 * In general this method should only be used if there needs to be an absolute maximum memory limit imposed on the parser, where that requirement is more important
246 * than the ability to parse any source document successfully.
247 * <p>
248 * This method can only be called before the {@link #iterator()} method has been called.
249 *
250 * @param buffer an existing character array to use for buffering the incoming character stream, must not be <code>null</code>.
251 * @return this <code>StreamedSource</code> instance, allowing multiple property setting methods to be chained in a single statement.
252 * @throws IllegalStateException if the {@link #iterator()} method has already been called.
253 */
254 public StreamedSource setBuffer(char[] buffer) {
255 if (isInitialised) throw new IllegalStateException("setBuffer() can only be called before iterator() is called");
256 streamedText.setBuffer(buffer);
257 return this;
258 }
259
260 /**
261 * Specifies whether an unbroken section of <a href="#PlainText">plain text</a> in the source document should always be coalesced into a single {@link Segment} by the {@linkplain #iterator() iterator}.
262 * <p>
263 * If this property is set to the <b>default</b> value of <code>false</code>,
264 * and a section of plain text is encountered in the document that is larger than the current {@linkplain #getBufferSize() buffer size},
265 * the text is <i>chunked</i> into multiple consecutive plain text segments in order to minimise memory usage.
266 * <p>
267 * If this property is set to <code>true</code> then chunking is disabled, ensuring that consecutive plain text segments are never generated,
268 * but instead forcing the internal buffer to expand to fit the largest section of plain text.
269 * <p>
270 * Note that {@link CharacterReference} segments are always handled separately from plain text, regardless of the value of this property.
271 * For this reason, algorithms that process element content almost always have to be designed to expect the text in multiple segments
272 * in order to handle character references, so there is usually no advantage in {@linkplain #setCoalescing(boolean) coalescing} plain text segments.
273 *
274 * @param coalescing the new value of the coalescing property.
275 * @return this <code>StreamedSource</code> instance, allowing multiple property setting methods to be chained in a single statement.
276 * @throws IllegalStateException if the {@link #iterator()} method has already been called.
277 */
278 public StreamedSource setCoalescing(final boolean coalescing) {
279 if (isInitialised) throw new IllegalStateException("setPlainTextWriter() can only be called before iterator() is called");
280 this.coalescing=coalescing;
281 return this;
282 }
283
284 /**
285 * Closes the underlying <code>Reader</code> or <code>InputStream</code> and releases any system resources associated with it.
286 * <p>
287 * If the stream is already closed then invoking this method has no effect.
288 *
289 * @throws IOException if an I/O error occurs.
290 */
291 public void close() throws IOException {
292 if (closeable!=null) closeable.close();
293 }
294
295 /**
296 * Returns the character encoding scheme of the source byte stream used to create this object.
297 * <p>
298 * This method works in essentially the same way as the {@link Source#getEncoding()} method.
299 * <p>
300 * If the byte stream used to create this object does not support the <code>mark</code> method, the algorithm that determines the encoding may have to wrap it
301 * in a <code>BufferedInputStream</code> in order to look ahead at the encoding meta data.
302 * This extra layer of buffering will then remain in place for the life of the <code>StreamedSource</code>, possibly impacting memory usage and/or degrading performance.
303 * It is always preferable to use the {@link #StreamedSource(Reader)} constructor if the encoding is known in advance.
304 * <p>
305 * The {@link #getEncodingSpecificationInfo()} method returns a simple description of how the value of this method was determined.
306 *
307 * @return the character encoding scheme of the source byte stream used to create this object, or <code>null</code> if the encoding is not known.
308 * @see #getEncodingSpecificationInfo()
309 */
310 public String getEncoding() {
311 return source.getEncoding();
312 }
313
314 /**
315 * Returns a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
316 * <p>
317 * The description is intended for informational purposes only.
318 * It is not guaranteed to have any particular format and can not be reliably parsed.
319 *
320 * @return a concise description of how the {@linkplain #getEncoding() encoding} of the source document was determined.
321 * @see #getEncoding()
322 */
323 public String getEncodingSpecificationInfo() {
324 return source.getEncodingSpecificationInfo();
325 }
326
327 /**
328 * Returns the preliminary encoding of the source document together with a concise description of how it was determined.
329 * <p>
330 * This method works in essentially the same way as the {@link Source#getPreliminaryEncodingInfo()} method.
331 * <p>
332 * The description returned by this method is intended for informational purposes only.
333 * It is not guaranteed to have any particular format and can not be reliably parsed.
334 *
335 * @return the preliminary encoding of the source document together with a concise description of how it was determined, or <code>null</code> if no preliminary encoding was required.
336 * @see #getEncoding()
337 */
338 public String getPreliminaryEncodingInfo() {
339 return source.getPreliminaryEncodingInfo();
340 }
341
342 /**
343 * Returns an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within the source document.
344 * <p>
345 * <i><a name="PlainText">Plain text</a></i> is defined as all text that is not part of a {@link Tag} or {@link CharacterReference}.
346 * <p>
347 * This results in a sequential walk-through of the entire source document.
348 * The {@linkplain Segment#getEnd() end} position of each segment should correspond with the {@linkplain Segment#getBegin() begin} position of the subsequent segment,
349 * unless any of the tags are enclosed by other tags.
350 * This could happen if there are {@linkplain TagType#isServerTag() server tags} present in the document, or in rare circumstances where the
351 * {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} contains {@linkplain StartTagType#MARKUP_DECLARATION markup declarations}.
352 * <p>
353 * Each segment generated by the iterator is parsed as the source text is streamed in. Previous segments are discarded for garbage collection.
354 * <p>
355 * If a section of plain text is encountered in the document that is larger than the current {@linkplain #getBufferSize() buffer size},
356 * the text is <i>chunked</i> into multiple consecutive plain text segments in order to minimise memory usage.
357 * Setting the {@link #setCoalescing(boolean) Coalescing} property to <code>true</code> disables chunking, ensuring that consecutive plain text segments are never generated,
358 * but instead forcing the internal buffer to expand to fit the largest section of plain text.
359 * Note that {@link CharacterReference} segments are always handled separately from plain text, regardless of whether {@linkplain #setCoalescing(boolean) coalescing}
360 * is enabled. For this reason, algorithms that process element content almost always have to be designed to expect the text in multiple segments
361 * in order to handle character references, so there is usually no advantage in {@linkplain #setCoalescing(boolean) coalescing} plain text segments.
362 * <p>
363 * Character references that are found inside tags, such as those present inside attribute values, do not generate separate segments from the iterator.
364 * <p>
365 * This method may only be called once on any particular <code>StreamedSource</code> instance.
366 * <p>
367 * <dl>
368 * <dt>Example:</dt>
369 * <dd>
370 * <p>
371 * The following code demonstrates the typical (implied) usage of this method through the <code>Iterable</code> interface
372 * to make an exact copy of the document from <code>reader</code> to <code>writer</code> (assuming no server tags are present):
373 * </p>
374 * <pre>
375 * StreamedSource streamedSource=new StreamedSource(reader);
376 * for (Segment segment : streamedSource) {
377 * if (segment instanceof Tag) {
378 * Tag tag=(Tag)segment;
379 * // HANDLE TAG
380 * // Uncomment the following line to ensure each tag is valid XML:
381 * // writer.write(tag.tidy()); continue;
382 * } else if (segment instanceof CharacterReference) {
383 * CharacterReference characterReference=(CharacterReference)segment;
384 * // HANDLE CHARACTER REFERENCE
385 * // Uncomment the following line to decode all character references instead of copying them verbatim:
386 * // characterReference.appendCharTo(writer); continue;
387 * } else {
388 * // HANDLE PLAIN TEXT
389 * }
390 * // unless specific handling has prevented getting to here, simply output the segment as is:
391 * writer.write(segment.toString());
392 * }</pre>
393 * <p>Note that the last line <code>writer.write(segment.toString())</code> in the above code can be replaced with the following for improved performance:</p>
394 * <pre>
395 * CharBuffer charBuffer=streamedSource.getCurrentSegmentCharBuffer();
396 * writer.write(charBuffer.array(),charBuffer.position(),charBuffer.length());</pre>
397 * </dd>
398 * <dd>
399 * <p>
400 * The following code demonstrates how to process the plain text content of a specific element, in this case to print the content of every paragraph element:
401 * </p>
402 * <pre>
403 * StreamedSource streamedSource=new StreamedSource(reader);
404 * StringBuilder sb=new StringBuilder();
405 * boolean insideParagraphElement=false;
406 * for (Segment segment : streamedSource) {
407 * if (segment instanceof Tag) {
408 * Tag tag=(Tag)segment;
409 * if (tag.getName().equals("p")) {
410 * if (tag instanceof StartTag) {
411 * insideParagraphElement=true;
412 * sb.setLength(0);
413 * } else { // tag instanceof EndTag
414 * insideParagraphElement=false;
415 * System.out.println(sb.toString());
416 * }
417 * }
418 * } else if (insideParagraphElement) {
419 * if (segment instanceof CharacterReference) {
420 * ((CharacterReference)segment).appendCharTo(sb);
421 * } else {
422 * sb.append(segment);
423 * }
424 * }
425 * }</pre>
426 * </dd>
427 * </dl>
428 * @return an iterator over every {@linkplain Tag tag}, {@linkplain CharacterReference character reference} and plain text segment contained within the source document.
429 */
430 public Iterator<Segment> iterator() {
431 if (isInitialised) throw new IllegalStateException("iterator() can only be called once");
432 isInitialised=true;
433 return new StreamedSourceIterator();
434 }
435
436 /**
437 * Returns the current {@link Segment} from the {@linkplain #iterator()}.
438 * <p>
439 * This is defined as the last {@link Segment} returned from the iterator's <code>next()</code> method.
440 * <p>
441 * This method returns <code>null</code> if the iterator's <code>next()</code> method has never been called, or its
442 * <code>hasNext()</code> method has returned the value <code>false</code>.
443 *
444 * @return the current {@link Segment} from the {@linkplain #iterator()}.
445 */
446 public Segment getCurrentSegment() {
447 return currentSegment;
448 }
449
450 /**
451 * Returns a <code>CharBuffer</code> containing the source text of the {@linkplain #getCurrentSegment() current segment}.
452 * <p>
453 * The returned <code>CharBuffer</code> provides a window into the internal <code>char[]</code> buffer including the position and length that spans the
454 * {@linkplain #getCurrentSegment() current segment}.
455 * <p>
456 * For example, the following code writes the source text of the current segment to <code>writer</code>:
457 * <p>
458 * <code>CharBuffer charBuffer=streamedSource.getCurrentSegmentCharBuffer();</code><br />
459 * <code>writer.write(charBuffer.array(),charBuffer.position(),charBuffer.length());</code>
460 * <p>
461 * This may provide a performance benefit over the standard way of accessing the source text of the current segment,
462 * which is to use the <code>CharSequence</code> interface of the segment directly, or to call {@link Segment#toString()}.
463 * <p>
464 * Because this <code>CharBuffer</code> is a direct window into the internal buffer of the <code>StreamedSource</code>, the contents of the
465 * <code>CharBuffer.array()</code> must not be modified, and the array is only guaranteed to hold the segment source text until the
466 * iterator's <code>hasNext()</code> or <code>next()</code> method is next called.
467 *
468 * @return a <code>CharBuffer</code> containing the source text of the {@linkplain #getCurrentSegment() current segment}.
469 */
470 public CharBuffer getCurrentSegmentCharBuffer() {
471 return streamedText.getCharBuffer(currentSegment.getBegin(),currentSegment.end);
472 }
473
474 /**
475 * Indicates whether the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>.
476 * <p>
477 * The algorithm used to determine this is designed to be relatively inexpensive and to provide an accurate result in
478 * most normal situations.
479 * An exact determination of whether the source document is XML would require a much more complex analysis of the text.
480 * <p>
481 * The algorithm is as follows:
482 * <ol class="HalfSeparated">
483 * <li>If the document begins with an {@linkplain StartTagType#XML_DECLARATION XML declaration}, it is an XML document.
484 * <li>If the document begins with a {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} that contains the text
485 * "<code>xhtml</code>", it is an <a target="_blank" href="http://www.w3.org/TR/xhtml1/">XHTML</a> document, and hence
486 * also an XML document.
487 * <li>If none of the above conditions are met, assume the document is normal HTML, and therefore not an XML document.
488 * </ol>
489 * <p>
490 * This method can only be called after the {@link #iterator()} method has been called.
491 *
492 * @return <code>true</code> if the source document is likely to be <a target="_blank" href="http://www.w3.org/TR/REC-xml/">XML</a>, otherwise <code>false</code>.
493 * @throws IllegalStateException if the {@link #iterator()} method has not yet been called.
494 */
495 public boolean isXML() {
496 if (!isInitialised) throw new IllegalStateException("isXML() method only available after iterator() has been called");
497 return isXML;
498 }
499
500 /**
501 * Sets the {@link Logger} that handles log messages.
502 * <p>
503 * Specifying a <code>null</code> argument disables logging completely for operations performed on this <code>StreamedSource</code> object.
504 * <p>
505 * A logger instance is created automatically for each <code>StreamedSource</code> object in the same way as is described in the
506 * {@link Source#setLogger(Logger)} method.
507 *
508 * @param logger the logger that will handle log messages, or <code>null</code> to disable logging.
509 * @see Config#LoggerProvider
510 */
511 public void setLogger(final Logger logger) {
512 source.setLogger(logger);
513 }
514
515 /**
516 * Returns the {@link Logger} that handles log messages.
517 * <p>
518 * A logger instance is created automatically for each <code>StreamedSource</code> object using the {@link LoggerProvider}
519 * specified by the static {@link Config#LoggerProvider} property.
520 * This can be overridden by calling the {@link #setLogger(Logger)} method.
521 * The name used for all automatically created logger instances is "<code>net.htmlparser.jericho</code>".
522 *
523 * @return the {@link Logger} that handles log messages, or <code>null</code> if logging is disabled.
524 */
525 public Logger getLogger() {
526 return source.getLogger();
527 }
528
529 /**
530 * Returns the current size of the internal character buffer.
531 * <p>
532 * This information is generally useful only for investigating memory and performance issues.
533 *
534 * @return the current size of the internal character buffer.
535 */
536 public int getBufferSize() {
537 return streamedText.getBuffer().length;
538 }
539
540 /**
541 * Returns a string representation of the object as generated by the default <code>Object.toString()</code> implementation.
542 * <p>
543 * In contrast to the {@link Source#toString()} implementation, it is generally not possible for this method to return the entire source text.
544 *
545 * @return a string representation of the object as generated by the default <code>Object.toString()</code> implementation.
546 */
547 public String toString() {
548 return super.toString();
549 }
550
551 /**
552 * Called by the garbage collector on an object when garbage collection determines that there are no more references to the object.
553 * <p>
554 * This implementation calls the {@link #close()} method if the underlying <code>Reader</code> or <code>InputStream</code> stream was created internally.
555 */
556 protected void finalize() {
557 automaticClose();
558 }
559
560 StreamedSource setHandleTags(final boolean handleTags) {
561 this.handleTags=handleTags;
562 return this;
563 }
564
565 StreamedSource setUnterminatedCharacterReferenceSettings(final Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings) {
566 this.unterminatedCharacterReferenceSettings=unterminatedCharacterReferenceSettings;
567 return this;
568 }
569
570 StreamedSource setSearchBegin(final int begin) {
571 if (isInitialised) throw new IllegalStateException("setSearchBegin() can only be called before iterator() is called");
572 final int segmentEnd=begin-1;
573 nextParsedSegment=new Segment(segmentEnd,segmentEnd);
574 return this;
575 }
576
577 private void automaticClose() {
578 if (automaticClose) try {close();} catch (IOException ex) {}
579 }
580
581 private static boolean isXML(final Segment firstNonTextSegment) {
582 if (firstNonTextSegment==null || !(firstNonTextSegment instanceof Tag)) return false;
583 Tag tag=(Tag)firstNonTextSegment;
584 if (tag.getTagType()==StartTagType.XML_DECLARATION) return true;
585 // if document has a DOCTYPE declaration and it contains the text "xhtml", it is an XML document:
586 if (tag.source.getParseText().indexOf("xhtml",tag.begin,tag.end)!=-1) return true;
587 return false;
588 }
589
590 private class StreamedSourceIterator implements Iterator<Segment> {
591 private final boolean coalescing;
592 private final boolean handleTags;
593 private Config.UnterminatedCharacterReferenceSettings unterminatedCharacterReferenceSettings;
594 private Segment nextSegment;
595 private int plainTextSegmentBegin=0;
596 private final char[] charByRef=new char[1]; // used to pass a single character by reference
597
598 public StreamedSourceIterator() {
599 coalescing=StreamedSource.this.coalescing;
600 handleTags=StreamedSource.this.handleTags;
601 unterminatedCharacterReferenceSettings=StreamedSource.this.unterminatedCharacterReferenceSettings;
602 loadNextParsedSegment();
603 isXML=isXML(nextParsedSegment);
604 }
605
606 public boolean hasNext() {
607 if (nextSegment==Tag.NOT_CACHED) loadNextParsedSegment();
608 return nextSegment!=null;
609 }
610
611 public Segment next() {
612 if (!hasNext()) throw new NoSuchElementException();
613 final Segment result=nextSegment;
614 nextSegment=(result==nextParsedSegment) ? Tag.NOT_CACHED : nextParsedSegment;
615 streamedText.setMinRequiredBufferBegin(result.end); // guaranteed not to be discarded until next call to loadNextParsedSegment()
616 currentSegment=result;
617 return result;
618 }
619
620 public void remove() {
621 throw new UnsupportedOperationException();
622 }
623
624 private final void loadNextParsedSegment() {
625 nextParsedSegment=findNextParsedSegment();
626 final int plainTextSegmentEnd=(nextParsedSegment!=null) ? nextParsedSegment.begin : streamedText.length();
627 nextSegment=(plainTextSegmentBegin<plainTextSegmentEnd) ? new Segment(source,plainTextSegmentBegin,plainTextSegmentEnd) : nextParsedSegment;
628 if (nextParsedSegment!=null && plainTextSegmentBegin<nextParsedSegment.end) plainTextSegmentBegin=nextParsedSegment.end;
629 }
630
631 private final Segment findNextParsedSegment() {
632 try {
633 int i=(nextParsedSegment instanceof StartTag && ((StartTag)nextParsedSegment).getTagType()==StartTagType.SERVER_COMMON_COMMENT)
634 ? nextParsedSegment.getEnd()
635 : nextParsedSegment.getBegin()+1;
636 final int searchEnd=coalescing ? streamedText.getEnd() : streamedText.getBufferOverflowPosition();
637 while (i<searchEnd) {
638 final char ch=streamedText.charAt(i);
639 if (ch=='&') {
640 if (i>=source.fullSequentialParseData[0]) { // do not handle character references inside tags or script elements
641 final CharacterReference characterReference=CharacterReference.construct(source,i,unterminatedCharacterReferenceSettings);
642 if (characterReference!=null) return characterReference;
643 }
644 } else if (handleTags && ch=='<') {
645 final Tag tag=TagType.getTagAt(source,i,false,assumeNoNestedTags);
646 if (tag!=null && !tag.isUnregistered()) {
647 final TagType tagType=tag.getTagType();
648 if (tag.end>source.fullSequentialParseData[0] && tagType!=StartTagType.DOCTYPE_DECLARATION) {
649 source.fullSequentialParseData[0]=(tagType==StartTagType.NORMAL && tag.name==HTMLElementName.SCRIPT && !((StartTag)tag).isEmptyElementTag()) ? Integer.MAX_VALUE : tag.end;
650 }
651 return tag;
652 }
653 }
654 i++;
655 }
656 if (i<streamedText.getEnd()) {
657 // not coalescing, reached buffer overflow position
658 return new Segment(source,plainTextSegmentBegin,i);
659 }
660 } catch (BufferOverflowException ex) {
661 // Unrecoverable buffer overflow - close the reader if it was created internally:
662 automaticClose();
663 throw ex;
664 } catch (IndexOutOfBoundsException ex) {
665 // normal way to catch end of stream.
666 }
667 // streamedText.length() is now guaranteed to return document length
668 // End of stream has been reached, can close the reader if it was created internally:
669 automaticClose();
670 return null;
671 }
672 }
673 }

   
Visit the aagtl Website