/[aagtl_public1]/src/net/htmlparser/jericho/OutputDocument.java
aagtl

Contents of /src/net/htmlparser/jericho/OutputDocument.java

Parent Directory | Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 23399 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.io.*;
24 import java.util.*;
25
26 /**
27 * Represents a modified version of an original {@link Source} document or {@link Segment}.
28 * <p>
29 * An <code>OutputDocument</code> represents an original {@link Source} document or {@link Segment} that
30 * has been modified by substituting segments of it with other text.
31 * Each of these substitutions must be registered in the output document,
32 * which is most commonly done using the various <code>replace</code>, <code>remove</code> or <code>insert</code> methods in this class.
33 * These methods internally {@linkplain #register(OutputSegment) register} one or more {@link OutputSegment} objects to define each substitution.
34 * <p>
35 * If a {@link Segment} is used to construct the output document, all character positions are relative to the source document of the specified segment.
36 * <p>
37 * After all of the substitutions have been registered, the modified text can be retrieved using the
38 * {@link #writeTo(Writer)} or {@link #toString()} methods.
39 * <p>
40 * The registered {@linkplain OutputSegment output segments} may be adjacent and may also overlap.
41 * An output segment that is completely enclosed by another output segment is not included in the output.
42 * <p>
43 * If unexpected results are being generated from an <code>OutputDocument</code>, the {@link #getDebugInfo()} method provides information on each
44 * {@linkplain #getRegisteredOutputSegments() registered output segment}, which should provide enough information to determine the cause of the problem.
45 * In most cases the problem will be caused by overlapping output segments.
46 * <p>
47 * The following example converts all externally referenced style sheets to internal style sheets:
48 * <p>
49 * <pre>
50 * URL sourceUrl=new URL(sourceUrlString);
51 * String htmlText=Util.getString(new InputStreamReader(sourceUrl.openStream()));
52 * Source source=new Source(htmlText);
53 * OutputDocument outputDocument=new OutputDocument(source);
54 * StringBuilder sb=new StringBuilder();
55 * List linkStartTags=source.getAllStartTags(HTMLElementName.LINK);
56 * for (Iterator i=linkStartTags.iterator(); i.hasNext();) {
57 * StartTag startTag=(StartTag)i.next();
58 * Attributes attributes=startTag.getAttributes();
59 * String rel=attributes.getValue("rel");
60 * if (!"stylesheet".equalsIgnoreCase(rel)) continue;
61 * String href=attributes.getValue("href");
62 * if (href==null) continue;
63 * String styleSheetContent;
64 * try {
65 * styleSheetContent=Util.getString(new InputStreamReader(new URL(sourceUrl,href).openStream()));
66 * } catch (Exception ex) {
67 * continue; // don't convert if URL is invalid
68 * }
69 * sb.setLength(0);
70 * sb.append("&lt;style");
71 * Attribute typeAttribute=attributes.get("type");
72 * if (typeAttribute!=null) sb.append(' ').append(typeAttribute);
73 * sb.append("&gt;\n").append(styleSheetContent).append("\n&lt;/style&gt;");
74 * outputDocument.replace(startTag,sb);
75 * }
76 * String convertedHtmlText=outputDocument.toString();
77 * </pre>
78 *
79 * @see OutputSegment
80 */
81 public final class OutputDocument implements CharStreamSource {
82 private CharSequence sourceText;
83 private ArrayList<OutputSegment> outputSegments=new ArrayList<OutputSegment>();
84
/**
 * Creates an output document whose original text is the specified source document.
 *
 * @param source  the source document on which the output is based, must not be <code>null</code>.
 * @throws IllegalArgumentException if <code>source</code> is <code>null</code>.
 */
public OutputDocument(final Source source) {
    if (source == null) {
        throw new IllegalArgumentException("source argument must not be null");
    }
    sourceText = source;
}
93
94 /**
95 * Constructs a new output document based on the specified {@link Segment}.
96 * @param segment the original {@link Segment}.
97 */
98 public OutputDocument(final Segment segment) {
99 if (segment==null) throw new IllegalArgumentException("segment argument must not be null");
100 Source source=segment.source;
101 this.sourceText=source;
102 if (segment.begin>0) remove(new Segment(source,0,segment.begin));
103 if (segment.end<source.end) remove(new Segment(source,segment.end,source.end));
104 }
105
// Package-private constructor: uses the specified ParseText directly as the original text of this output document.
OutputDocument(final ParseText parseText) {
    sourceText = parseText;
}
109
110 /**
111 * Returns the original source text upon which this output document is based.
112 * <p>
113 * If a {@link Segment} was used to construct the output document, this returns the text of the entire source document rather than just the segment.
114 *
115 * @return the original source text upon which this output document is based.
116 */
117 public CharSequence getSourceText() {
118 return sourceText;
119 }
120
121 /**
122 * Removes the specified {@linkplain Segment segment} from this output document.
123 * <p>
124 * This is equivalent to {@link #replace(Segment,CharSequence) replace}<code>(segment,null)</code>.
125 *
126 * @param segment the segment to remove.
127 */
128 public void remove(final Segment segment) {
129 register(new RemoveOutputSegment(segment));
130 }
131
132 /**
133 * Removes all the segments from this output document represented by the specified source {@linkplain Segment} objects.
134 * <p>
135 * This is equivalent to the following code:<pre>
136 * for (Iterator i=segments.iterator(); i.hasNext();)
137 * {@link #remove(Segment) remove}((Segment)i.next());</pre>
138 *
139 * @param segments a collection of segments to remove, represented by source {@link Segment} objects.
140 */
141 public void remove(final Collection<? extends Segment> segments) {
142 for (Segment segment : segments) remove(segment);
143 }
144
145 /**
146 * Inserts the specified text at the specified character position in this output document.
147 * @param pos the character position at which to insert the text.
148 * @param text the replacement text.
149 */
150 public void insert(final int pos, final CharSequence text) {
151 register(new StringOutputSegment(pos,pos,text));
152 }
153
154 /**
155 * Replaces the specified {@linkplain Segment segment} in this output document with the specified text.
156 * <p>
157 * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
158 * and results in the segment being completely removed from the output document.
159 *
160 * @param segment the segment to replace.
161 * @param text the replacement text, or <code>null</code> to remove the segment.
162 */
163 public void replace(final Segment segment, final CharSequence text) {
164 replace(segment.getBegin(),segment.getEnd(),text);
165 }
166
167 /**
168 * Replaces the specified segment of this output document with the specified text.
169 * <p>
170 * Specifying a <code>null</code> argument to the <code>text</code> parameter is exactly equivalent to specifying an empty string,
171 * and results in the segment being completely removed from the output document.
172 *
173 * @param begin the character position at which to begin the replacement.
174 * @param end the character position at which to end the replacement.
175 * @param text the replacement text, or <code>null</code> to remove the segment.
176 */
177 public void replace(final int begin, final int end, final CharSequence text) {
178 register(new StringOutputSegment(begin,end,text));
179 }
180
181 /**
182 * Replaces the specified segment of this output document with the specified character.
183 *
184 * @param begin the character position at which to begin the replacement.
185 * @param end the character position at which to end the replacement.
186 * @param ch the replacement character.
187 */
188 public void replace(final int begin, final int end, final char ch) {
189 register(new CharOutputSegment(begin,end,ch));
190 }
191
192 /**
193 * Replaces the specified {@link FormControl} in this output document.
194 * <p>
195 * The effect of this method is to {@linkplain #register(OutputSegment) register} zero or more
196 * {@linkplain OutputSegment output segments} in the output document as required to reflect
197 * previous modifications to the control's state.
198 * The state of a control includes its <a href="FormControl.html#SubmissionValue">submission value</a>,
199 * {@linkplain FormControl#setOutputStyle(FormControlOutputStyle) output style}, and whether it has been
200 * {@linkplain FormControl#setDisabled(boolean) disabled}.
201 * <p>
202 * The state of the form control should not be modified after this method is called, as there is no guarantee that
203 * subsequent changes either will or will not be reflected in the final output.
204 * A second call to this method with the same parameter is not allowed.
205 * It is therefore recommended to call this method as the last action before the output is generated.
206 * <p>
207 * Although the specifics of the number and nature of the output segments added in any particular circumstance
208 * is not defined in the specification, it can generally be assumed that only the minimum changes necessary
209 * are made to the original document. If the state of the control has not been modified, calling this method
210 * has no effect at all.
211 *
212 * @param formControl the form control to replace.
213 * @see #replace(FormFields)
214 */
215 public void replace(final FormControl formControl) {
216 formControl.replaceInOutputDocument(this);
217 }
218
219 /**
220 * {@linkplain #replace(FormControl) Replaces} all the constituent {@linkplain FormControl form controls}
221 * from the specified {@link FormFields} in this output document.
222 * <p>
223 * This is equivalent to the following code:
224 * <pre>for (Iterator i=formFields.{@link FormFields#getFormControls() getFormControls()}.iterator(); i.hasNext();)
225 * {@link #replace(FormControl) replace}((FormControl)i.next());</pre>
226 * <p>
227 * The state of any of the form controls in the specified form fields should not be modified after this method is called,
228 * as there is no guarantee that subsequent changes either will or will not be reflected in the final output.
229 * A second call to this method with the same parameter is not allowed.
230 * It is therefore recommended to call this method as the last action before the output is generated.
231 *
232 * @param formFields the form fields to replace.
233 * @see #replace(FormControl)
234 */
235 public void replace(final FormFields formFields) {
236 formFields.replaceInOutputDocument(this);
237 }
238
239 /**
240 * Replaces the specified {@link Attributes} segment in this output document with the name/value entries
241 * in the returned <code>Map</code>.
242 * The returned map initially contains entries representing the attributes from the source document,
243 * which can be modified before output.
244 * <p>
245 * The documentation of the {@link #replace(Attributes,Map)} method contains more information about the requirements
246 * of the map entries.
247 * <p>
248 * Specifying a value of <code>true</code> as an argument to the <code>convertNamesToLowerCase</code> parameter
249 * causes all original attribute names to be converted to lower case in the map.
250 * This simplifies the process of finding/updating specific attributes since map keys are case sensitive.
251 * <p>
252 * Attribute values are automatically {@linkplain CharacterReference#decode(CharSequence) decoded} before
253 * being loaded into the map.
254 * <p>
255 * This method is logically equivalent to:<br />
256 * {@link #replace(Attributes,Map) replace}<code>(attributes, attributes.</code>{@link Attributes#populateMap(Map,boolean) populateMap(new LinkedHashMap&lt;String,String&gt;(),convertNamesToLowerCase)}<code>)</code>
257 * <p>
258 * The use of <code>LinkedHashMap</code> to implement the map ensures (probably unnecessarily) that
259 * existing attributes are output in the same order as they appear in the source document, and new
260 * attributes are output in the same order as they are added.
261 * <p>
262 * <dl>
263 * <dt>Example:</dt>
264 * <dd><pre>
265 * Source source=new Source(htmlDocument);
266 * Attributes bodyAttributes
267 * =source.getNextStartTag(0,HTMLElementName.BODY).getAttributes();
268 * OutputDocument outputDocument=new OutputDocument(source);
269 * Map&lt;String,String&gt; attributesMap=outputDocument.replace(bodyAttributes,true);
270 * attributesMap.put("bgcolor","green");
271 * String htmlDocumentWithGreenBackground=outputDocument.toString();</pre></dl>
272 *
273 * @param attributes the <code>Attributes</code> segment defining the span of the segment and initial name/value entries of the returned map.
274 * @param convertNamesToLowerCase specifies whether all attribute names are converted to lower case in the map.
275 * @return a <code>Map</code> containing the name/value entries to be output.
276 * @see #replace(Attributes,Map)
277 */
278 public Map<String,String> replace(final Attributes attributes, boolean convertNamesToLowerCase) {
279 AttributesOutputSegment attributesOutputSegment=new AttributesOutputSegment(attributes,convertNamesToLowerCase);
280 register(attributesOutputSegment);
281 return attributesOutputSegment.getMap();
282 }
283
284 /**
285 * Replaces the specified attributes segment in this source document with the name/value entries in the specified <code>Map</code>.
286 * <p>
287 * This method might be used if the <code>Map</code> containing the new attribute values
288 * should not be preloaded with the same entries as the source attributes, or a map implementation
289 * other than <code>LinkedHashMap</code> is required.
290 * Otherwise, the {@link #replace(Attributes, boolean convertNamesToLowerCase)} method is generally more useful.
291 * <p>
292 * An attribute with no value is represented by a map entry with a <code>null</code> value.
293 * <p>
294 * Attribute values are stored unencoded in the map, and are automatically
295 * {@linkplain CharacterReference#encode(CharSequence) encoded} if necessary during output.
296 * <p>
297 * The use of invalid characters in attribute names results in unspecified behaviour.
298 * <p>
299 * Note that methods in the <code>Attributes</code> class treat attribute names as case insensitive,
300 * whereas the <code>Map</code> treats them as case sensitive.
301 *
302 * @param attributes the <code>Attributes</code> object defining the span of the segment to replace.
303 * @param map the <code>Map</code> containing the name/value entries.
304 * @see #replace(Attributes, boolean convertNamesToLowerCase)
305 */
306 public void replace(final Attributes attributes, final Map<String,String> map) {
307 register(new AttributesOutputSegment(attributes,map));
308 }
309
310 /**
311 * Replaces the specified segment of this output document with a string of spaces of the same length.
312 * <p>
313 * This method is most commonly used to remove segments of the document without affecting the character positions of the remaining elements.
314 * <p>
315 * It is used internally to implement the functionality available through the {@link Segment#ignoreWhenParsing()} method.
316 * <p>
317 * To remove a segment from the output document completely, use the {@link #remove(Segment)} method instead.
318 *
319 * @param begin the character position at which to begin the replacement.
320 * @param end the character position at which to end the replacement.
321 */
322 public void replaceWithSpaces(final int begin, final int end) {
323 register(new BlankOutputSegment(begin,end));
324 }
325
326 /**
327 * Registers the specified {@linkplain OutputSegment output segment} in this output document.
328 * <p>
329 * Use this method if you want to use a customised {@link OutputSegment} class.
330 *
331 * @param outputSegment the output segment to register.
332 */
333 public void register(final OutputSegment outputSegment) {
334 outputSegments.add(outputSegment);
335 }
336
337 /**
338 * Writes the final content of this output document to the specified <code>Writer</code>.
339 * <p>
340 * The {@link #writeTo(Writer, int begin, int end)} method allows the output of a portion of the output document.
341 * <p>
342 * If the output is required in the form of a <code>Reader</code>, use {@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)} instead.
343 *
344 * @param writer the destination <code>java.io.Writer</code> for the output.
345 * @throws IOException if an I/O exception occurs.
346 * @see #toString()
347 */
348 public void writeTo(final Writer writer) throws IOException {
349 try {
350 appendTo(writer);
351 } finally {
352 writer.flush();
353 }
354 }
355
356 /**
357 * Writes the specified portion of the final content of this output document to the specified <code>Writer</code>.
358 * <p>
359 * Any zero-length output segments located at <code>begin</code> or <code>end</code> are included in the output.
360 *
361 * @param writer the destination <code>java.io.Writer</code> for the output.
362 * @param begin the character position at which to start the output, inclusive.
363 * @param end the character position at which to end the output, exclusive.
364 * @throws IOException if an I/O exception occurs.
365 * @see #writeTo(Writer)
366 */
367 public void writeTo(final Writer writer, final int begin, final int end) throws IOException {
368 try {
369 appendTo(writer,begin,end);
370 } finally {
371 writer.flush();
372 }
373 }
374
375 /**
376 * Appends the final content of this output document to the specified <code>Appendable</code> object.
377 * <p>
378 * The {@link #appendTo(Appendable, int begin, int end)} method allows the output of a portion of the output document.
379 *
380 * @param appendable the destination <code>java.lang.Appendable</code> object for the output.
381 * @throws IOException if an I/O exception occurs.
382 * @see #toString()
383 */
384 public void appendTo(final Appendable appendable) throws IOException {
385 appendTo(appendable,0,sourceText.length());
386 }
387
388 /**
389 * Appends the specified portion of the final content of this output document to the specified <code>Appendable</code> object.
390 * <p>
391 * Any zero-length output segments located at <code>begin</code> or <code>end</code> are included in the output.
392 *
393 * @param appendable the destination <code>java.lang.Appendable</code> object for the output.
394 * @param begin the character position at which to start the output, inclusive.
395 * @param end the character position at which to end the output, exclusive.
396 * @throws IOException if an I/O exception occurs.
397 * @see #appendTo(Appendable)
398 */
399 public void appendTo(final Appendable appendable, final int begin, final int end) throws IOException {
400 if (outputSegments.isEmpty()) {
401 appendable.append(sourceText,begin,end);
402 return;
403 }
404 int pos=begin;
405 Collections.sort(outputSegments,OutputSegment.COMPARATOR);
406 for (OutputSegment outputSegment : outputSegments) {
407 if (outputSegment.getEnd()<pos) continue; // skip output segments before begin, and any that are enclosed by other output segments
408 if (outputSegment.getEnd()==pos && outputSegment.getBegin()<pos) continue; // skip output segments that end at pos unless they are zero length
409 if (outputSegment.getBegin()>end) break; // stop processing output segments if they are not longer in the desired output range
410 if (outputSegment.getBegin()==end && outputSegment.getEnd()>end) break; // stop processing output segments if they start at end unless they are zero length
411 if (outputSegment.getBegin()>pos) {
412 appendable.append(sourceText,pos,outputSegment.getBegin());
413 }
414 if (outputSegment.getBegin()<pos && outputSegment instanceof BlankOutputSegment) {
415 // Overlapping BlankOutputSegments requires special handling to ensure the correct number of blanks are inserted.
416 for (final int outputSegmentEnd=outputSegment.getEnd(); pos<outputSegmentEnd; pos++) appendable.append(' ');
417 } else {
418 outputSegment.appendTo(appendable);
419 pos=outputSegment.getEnd();
420 }
421 }
422 if (pos<end) appendable.append(sourceText,pos,end);
423 }
424
425 // Documentation inherited from CharStreamSource
426 public long getEstimatedMaximumOutputLength() {
427 long estimatedMaximumOutputLength=sourceText.length();
428 for (OutputSegment outputSegment : outputSegments) {
429 final int outputSegmentOriginalLength=outputSegment.getEnd()-outputSegment.getBegin();
430 estimatedMaximumOutputLength+=(outputSegment.getEstimatedMaximumOutputLength()-outputSegmentOriginalLength);
431 }
432 return estimatedMaximumOutputLength>=0L ? estimatedMaximumOutputLength : -1L;
433 }
434
435 /**
436 * Returns the final content of this output document as a <code>String</code>.
437 * @return the final content of this output document as a <code>String</code>.
438 * @see #writeTo(Writer)
439 */
440 public String toString() {
441 return CharStreamSourceUtil.toString(this);
442 }
443
444 /**
445 * Returns a string representation of this object useful for debugging purposes.
446 * <p>
447 * The output includes details of all the {@link #getRegisteredOutputSegments() registered output segments}.
448 *
449 * @return a string representation of this object useful for debugging purposes.
450 */
451 public String getDebugInfo() {
452 StringBuilder sb=new StringBuilder();
453 for (OutputSegment outputSegment : getRegisteredOutputSegments()) {
454 if (outputSegment instanceof BlankOutputSegment)
455 sb.append("Replace with Spaces: ");
456 else if (outputSegment instanceof RemoveOutputSegment)
457 sb.append("Remove: ");
458 else
459 sb.append("Replace: ");
460 if (sourceText instanceof Source) {
461 Source source=(Source)sourceText;
462 sb.append('(');
463 source.getRowColumnVector(outputSegment.getBegin()).appendTo(sb);
464 sb.append('-');
465 source.getRowColumnVector(outputSegment.getEnd()).appendTo(sb);
466 sb.append(')');
467 } else {
468 sb.append("(p").append(outputSegment.getBegin()).append("-p").append(outputSegment.getEnd()).append(')');
469 }
470 sb.append(' ');
471 String outputFromSegment=outputSegment.toString();
472 if (outputFromSegment.length()<=20) {
473 sb.append(outputFromSegment);
474 } else {
475 sb.append(outputFromSegment.substring(0,20)).append("...");
476 }
477 sb.append(Config.NewLine);
478 }
479 return sb.toString();
480 }
481
482 /**
483 * Returns a list all of the {@linkplain #register(OutputSegment) registered} {@link OutputSegment} objects in this output document.
484 * <p>
485 * The output segments are sorted in order of their {@linkplain OutputSegment#getBegin() starting position} in the document.
486 * <p>
487 * The returned list is modifiable and any changes will affect the output generated by this <code>OutputDocument</code>.
488 *
489 * @return a list all of the {@linkplain #register(OutputSegment) registered} {@link OutputSegment} objects in this output document.
490 */
491 public List<OutputSegment> getRegisteredOutputSegments() {
492 Collections.sort(outputSegments,OutputSegment.COMPARATOR);
493 return outputSegments;
494 }
495 }

   
Visit the aagtl Website