/[aagtl_public1]/src/net/htmlparser/jericho/Renderer.java
aagtl

Contents of /src/net/htmlparser/jericho/Renderer.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 8 months ago) by zoffadmin
File size: 59652 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.*;
24 import java.io.*;
25
26 /**
27 * Performs a simple rendering of HTML markup into text.
28 * <p>
29 * This provides a human readable version of the segment content that is modelled on the way
30 * <a target="_blank" href="http://www.mozilla.com/thunderbird/">Mozilla Thunderbird</a> and other email clients provide an automatic conversion of
31 * HTML content to text in their <a target="_blank" href="http://tools.ietf.org/html/rfc2046#section-5.1.4">alternative MIME encoding</a> of emails.
32 * <p>
33 * The output using default settings complies with the "text/plain; format=flowed" (DelSp=No) protocol described in
34 * <a target="_blank" href="http://tools.ietf.org/html/rfc3676">RFC3676</a>.
35 * <p>
36 * Many properties are available to customise the output, possibly the most significant of which being {@link #setMaxLineLength(int) MaxLineLength}.
37 * See the individual property descriptions for details.
38 * <p>
39 * Use one of the following methods to obtain the output:
40 * <ul>
41 * <li>{@link #writeTo(Writer)}</li>
42 * <li>{@link #appendTo(Appendable)}</li>
43 * <li>{@link #toString()}</li>
44 * <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
45 * </ul>
46 * <p>
47 * The rendering of some constructs, especially tables, is very rudimentary.
48 * No attempt is made to render nested tables properly, except to ensure that all of the text content is included in the output.
49 * <p>
50 * Rendering an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
51 * <p>
52 * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
53 * <p>
54 * To extract pure text without any rendering of the markup, use the {@link TextExtractor} class instead.
55 */
56 public class Renderer implements CharStreamSource {
57 private final Segment rootSegment;
58 private int maxLineLength=76;
59 private String newLine="\r\n";
60 private boolean includeHyperlinkURLs=true;
61 private boolean includeAlternateText=true;
62 private boolean decorateFontStyles=false;
63 private boolean convertNonBreakingSpaces=Config.ConvertNonBreakingSpaces;
64 private int blockIndentSize=4;
65 private int listIndentSize=6;
66 private char[] listBullets=new char[] {'*','o','+','#'};
67 private boolean includeFirstElementTopMargin=false;
68 private String tableCellSeparator=" \t";
69
70 private static final int UNORDERED_LIST=-1;
71
72 private static Map<String,ElementHandler> ELEMENT_HANDLERS=new HashMap<String,ElementHandler>();
73 static {
74 ELEMENT_HANDLERS.put(HTMLElementName.A,A_ElementHandler.INSTANCE);
75 ELEMENT_HANDLERS.put(HTMLElementName.ADDRESS,StandardBlockElementHandler.INSTANCE_0_0);
76 ELEMENT_HANDLERS.put(HTMLElementName.APPLET,AlternateTextElementHandler.INSTANCE);
77 ELEMENT_HANDLERS.put(HTMLElementName.B,FontStyleElementHandler.INSTANCE_B);
78 ELEMENT_HANDLERS.put(HTMLElementName.BLOCKQUOTE,StandardBlockElementHandler.INSTANCE_1_1_INDENT);
79 ELEMENT_HANDLERS.put(HTMLElementName.BR,BR_ElementHandler.INSTANCE);
80 ELEMENT_HANDLERS.put(HTMLElementName.BUTTON,RemoveElementHandler.INSTANCE);
81 ELEMENT_HANDLERS.put(HTMLElementName.CAPTION,StandardBlockElementHandler.INSTANCE_0_0);
82 ELEMENT_HANDLERS.put(HTMLElementName.CENTER,StandardBlockElementHandler.INSTANCE_1_1);
83 ELEMENT_HANDLERS.put(HTMLElementName.CODE,FontStyleElementHandler.INSTANCE_CODE);
84 ELEMENT_HANDLERS.put(HTMLElementName.DD,StandardBlockElementHandler.INSTANCE_0_0_INDENT);
85 ELEMENT_HANDLERS.put(HTMLElementName.DIR,ListElementHandler.INSTANCE_UL);
86 ELEMENT_HANDLERS.put(HTMLElementName.DIV,StandardBlockElementHandler.INSTANCE_0_0);
87 ELEMENT_HANDLERS.put(HTMLElementName.DT,StandardBlockElementHandler.INSTANCE_0_0);
88 ELEMENT_HANDLERS.put(HTMLElementName.EM,FontStyleElementHandler.INSTANCE_I);
89 ELEMENT_HANDLERS.put(HTMLElementName.FIELDSET,StandardBlockElementHandler.INSTANCE_1_1);
90 ELEMENT_HANDLERS.put(HTMLElementName.FORM,StandardBlockElementHandler.INSTANCE_1_1);
91 ELEMENT_HANDLERS.put(HTMLElementName.H1,StandardBlockElementHandler.INSTANCE_2_1);
92 ELEMENT_HANDLERS.put(HTMLElementName.H2,StandardBlockElementHandler.INSTANCE_2_1);
93 ELEMENT_HANDLERS.put(HTMLElementName.H3,StandardBlockElementHandler.INSTANCE_2_1);
94 ELEMENT_HANDLERS.put(HTMLElementName.H4,StandardBlockElementHandler.INSTANCE_2_1);
95 ELEMENT_HANDLERS.put(HTMLElementName.H5,StandardBlockElementHandler.INSTANCE_2_1);
96 ELEMENT_HANDLERS.put(HTMLElementName.H6,StandardBlockElementHandler.INSTANCE_2_1);
97 ELEMENT_HANDLERS.put(HTMLElementName.HEAD,RemoveElementHandler.INSTANCE);
98 ELEMENT_HANDLERS.put(HTMLElementName.HR,HR_ElementHandler.INSTANCE);
99 ELEMENT_HANDLERS.put(HTMLElementName.I,FontStyleElementHandler.INSTANCE_I);
100 ELEMENT_HANDLERS.put(HTMLElementName.IMG,AlternateTextElementHandler.INSTANCE);
101 ELEMENT_HANDLERS.put(HTMLElementName.INPUT,AlternateTextElementHandler.INSTANCE);
102 ELEMENT_HANDLERS.put(HTMLElementName.LEGEND,StandardBlockElementHandler.INSTANCE_0_0);
103 ELEMENT_HANDLERS.put(HTMLElementName.LI,LI_ElementHandler.INSTANCE);
104 ELEMENT_HANDLERS.put(HTMLElementName.MENU,ListElementHandler.INSTANCE_UL);
105 ELEMENT_HANDLERS.put(HTMLElementName.MAP,RemoveElementHandler.INSTANCE);
106 ELEMENT_HANDLERS.put(HTMLElementName.NOFRAMES,RemoveElementHandler.INSTANCE);
107 ELEMENT_HANDLERS.put(HTMLElementName.NOSCRIPT,RemoveElementHandler.INSTANCE);
108 ELEMENT_HANDLERS.put(HTMLElementName.OL,ListElementHandler.INSTANCE_OL);
109 ELEMENT_HANDLERS.put(HTMLElementName.P,StandardBlockElementHandler.INSTANCE_1_1);
110 ELEMENT_HANDLERS.put(HTMLElementName.PRE,PRE_ElementHandler.INSTANCE);
111 ELEMENT_HANDLERS.put(HTMLElementName.SCRIPT,RemoveElementHandler.INSTANCE);
112 ELEMENT_HANDLERS.put(HTMLElementName.SELECT,RemoveElementHandler.INSTANCE);
113 ELEMENT_HANDLERS.put(HTMLElementName.STRONG,FontStyleElementHandler.INSTANCE_B);
114 ELEMENT_HANDLERS.put(HTMLElementName.STYLE,RemoveElementHandler.INSTANCE);
115 ELEMENT_HANDLERS.put(HTMLElementName.TEXTAREA,RemoveElementHandler.INSTANCE);
116 ELEMENT_HANDLERS.put(HTMLElementName.TD,TD_ElementHandler.INSTANCE);
117 ELEMENT_HANDLERS.put(HTMLElementName.TH,TD_ElementHandler.INSTANCE);
118 ELEMENT_HANDLERS.put(HTMLElementName.TR,StandardBlockElementHandler.INSTANCE_0_0);
119 ELEMENT_HANDLERS.put(HTMLElementName.U,FontStyleElementHandler.INSTANCE_U);
120 ELEMENT_HANDLERS.put(HTMLElementName.UL,ListElementHandler.INSTANCE_UL);
121 }
122
123 /**
124 * Constructs a new <code>Renderer</code> based on the specified {@link Segment}.
125 * @param segment the segment containing the HTML to be rendered.
126 * @see Segment#getRenderer()
127 */
public Renderer(final Segment segment) {
	// The segment is stored as-is; no validation or copying is performed here.
	this.rootSegment = segment;
}
131
132 // Documentation inherited from CharStreamSource
133 public void writeTo(final Writer writer) throws IOException {
134 appendTo(writer);
135 writer.flush();
136 }
137
138 // Documentation inherited from CharStreamSource
139 public void appendTo(final Appendable appendable) throws IOException {
140 new Processor(this,rootSegment,getMaxLineLength(),getNewLine(),getIncludeHyperlinkURLs(),getIncludeAlternateText(),getDecorateFontStyles(),getConvertNonBreakingSpaces(),getBlockIndentSize(),getListIndentSize(),getListBullets(),getTableCellSeparator()).appendTo(appendable);
141 }
142
143 // Documentation inherited from CharStreamSource
144 public long getEstimatedMaximumOutputLength() {
145 return rootSegment.length();
146 }
147
148 // Documentation inherited from CharStreamSource
149 public String toString() {
150 return CharStreamSourceUtil.toString(this);
151 }
152
153 /**
154 * Sets the column at which lines are to be wrapped.
155 * <p>
156 * Lines that would otherwise exceed this length are wrapped onto a new line at a word boundary.
157 * <p>
158 * A Line may still exceed this length if it consists of a single word, where the length of the word plus the line indent exceeds the maximum length.
159 * In this case the line is wrapped immediately after the end of the word.
160 * <p>
161 * The default value is <code>76</code>, which reflects the maximum line length for sending
162 * email data specified in <a target="_blank" href="http://rfc.net/rfc2049.html#s3.">RFC2049 section 3.5</a>.
163 *
164 * @param maxLineLength the column at which lines are to be wrapped.
165 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
166 * @see #getMaxLineLength()
167 */
168 public Renderer setMaxLineLength(final int maxLineLength) {
169 this.maxLineLength=maxLineLength;
170 return this;
171 }
172
173 /**
174 * Returns the column at which lines are to be wrapped.
175 * <p>
176 * See the {@link #setMaxLineLength(int)} method for a full description of this property.
177 *
178 * @return the column at which lines are to be wrapped.
179 */
180 public int getMaxLineLength() {
181 return maxLineLength;
182 }
183
184 /**
185 * Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
186 * <p>
187 * The default value is <code>"\r\n"</code> <span title="carriage return + line feed">(CR+LF)</span> regardless of the platform on which the library is running.
188 * This is so that the default configuration produces valid
189 * <a target="_blank" href="http://tools.ietf.org/html/rfc1521#section-7.1.2">MIME text/plain</a> output, which mandates the use of CR+LF for line breaks.
190 * <p>
191 * Specifying a <code>null</code> argument causes the output to use the same new line string as is used in the source document, which is
192 * determined via the {@link Source#getNewLine()} method.
193 * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
194 * or using the value from the static {@link Config#NewLine} property.
195 *
196 * @param newLine the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
197 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
198 * @see #getNewLine()
199 */
200 public Renderer setNewLine(final String newLine) {
201 this.newLine=newLine;
202 return this;
203 }
204
205 /**
206 * Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
207 * <p>
208 * See the {@link #setNewLine(String)} method for a full description of this property.
209 *
210 * @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
211 */
212 public String getNewLine() {
213 if (newLine==null) newLine=rootSegment.source.getBestGuessNewLine();
214 return newLine;
215 }
216
217 /**
218 * Sets whether hyperlink URLs are included in the output.
219 * <p>
220 * The default value is <code>true</code>.
221 * <p>
222 * When this property is <code>true</code>, the URL of each hyperlink is included in the output as determined by the implementation of the
223 * {@link #renderHyperlinkURL(StartTag)} method.
224 * <p>
225 * <dl>
226 * <dt>Example:</dt>
227 * <dd>
228 * <p>
229 * Assuming the default implementation of {@link #renderHyperlinkURL(StartTag)}, when this property is <code>true</code>, the following HTML:
230 * <blockquote class="code">
231 * <code>&lt;a href="http://jericho.htmlparser.net/"&gt;Jericho HTML Parser&lt;/a&gt;</code>
232 * </blockquote>
233 * produces the following output:
234 * <blockquote class="code">
235 * <code>Jericho HTML Parser &lt;http://jericho.htmlparser.net/&gt;</code>
236 * </blockquote>
237 * </dd>
238 * </dl>
239 *
240 * @param includeHyperlinkURLs specifies whether hyperlink URLs are included in the output.
241 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
242 * @see #getIncludeHyperlinkURLs()
243 */
244 public Renderer setIncludeHyperlinkURLs(final boolean includeHyperlinkURLs) {
245 this.includeHyperlinkURLs=includeHyperlinkURLs;
246 return this;
247 }
248
249 /**
250 * Indicates whether hyperlink URLs are included in the output.
251 * <p>
252 * See the {@link #setIncludeHyperlinkURLs(boolean)} method for a full description of this property.
253 *
254 * @return <code>true</code> if hyperlink URLs are included in the output, otherwise <code>false</code>.
255 */
256 public boolean getIncludeHyperlinkURLs() {
257 return includeHyperlinkURLs;
258 }
259
260 /**
261 * Renders the hyperlink URL from the specified {@link StartTag}.
262 * <p>
263 * A return value of <code>null</code> indicates that the hyperlink URL should not be rendered at all.
264 * <p>
265 * The default implementation of this method returns <code>null</code> if the <code>href</code> attribute of the specified start tag
266 * is '<code>#</code>', starts with "<code>javascript:</code>", or is missing.
267 * In all other cases it returns the value of the <code>href</code> attribute enclosed in angle brackets.
268 * <p>
269 * See the documentation of the {@link #setIncludeHyperlinkURLs(boolean)} method for an example of how a hyperlink is rendered by the default implementation.
270 * <p>
271 * This method can be overridden in a subclass to customise the rendering of hyperlink URLs.
272 * <p>
273 * Rendering of hyperlink URLs can be disabled completely without overriding this method by setting the
274 * {@link #setIncludeHyperlinkURLs(boolean) IncludeHyperlinkURLs} property to <code>false</code>.
275 * <p>
276 * <dl>
277 * <dt>Example:</dt>
278 * <dd>
279 * To render hyperlink URLs without the enclosing angle brackets:<br /><br />
280 * <code>
281 * Renderer renderer=new Renderer(segment) {<br />
282 * &nbsp; &nbsp; public String renderHyperlinkURL(StartTag startTag) {<br />
283 * &nbsp; &nbsp; &nbsp; &nbsp; String href=startTag.getAttributeValue("href");<br />
284 * &nbsp; &nbsp; &nbsp; &nbsp; if (href==null || href.equals("#") || href.startsWith("javascript:")) return null;<br />
285 * &nbsp; &nbsp; &nbsp; &nbsp; return href;<br />
286 * &nbsp; &nbsp; }<br />
287 * };<br />
288 * String renderedSegment=renderer.toString();
289 * </code>
290 * </dd>
291 * </dl>
292 * @param startTag the start tag of the hyperlink element, must not be <code>null</code>.
293 * @return The rendered hyperlink URL from the specified {@link StartTag}, or <code>null</code> if the hyperlink URL should not be rendered.
294 */
295 public String renderHyperlinkURL(final StartTag startTag) {
296 final String href=startTag.getAttributeValue("href");
297 if (href==null || href.equals("#") || href.startsWith("javascript:")) return null;
298 return '<'+href+'>';
299 }
300
301 /**
302 * Sets whether the alternate text of a tag that has an <code>alt</code> attribute is included in the output.
303 * <p>
304 * The default value is <code>true</code>.
305 * Note that this is not consistent with common email clients such as Mozilla Thunderbird which do not render alternate text at all,
306 * even when a tag specifies alternate text.
307 * <p>
308 * When this property is <code>true</code>, the alternate text is included in the output as determined by the implementation of the
309 * {@link #renderAlternateText(StartTag)} method.
310 * <p>
311 * <dl>
312 * <dt>Example:</dt>
313 * <dd>
314 * <p>
315 * Assuming the default implementation of {@link #renderAlternateText(StartTag)}, when this property is <code>true</code>, the following HTML:
316 * <blockquote class="code">
317 * <code>&lt;img src="smiley.png" alt="smiley face" /&gt;</code>
318 * </blockquote>
319 * produces the following output:
320 * <blockquote class="code">
321 * <code>[smiley face]</code>
322 * </blockquote>
323 * </dd>
324 * </dl>
325 *
326 * @param includeAlternateText specifies whether the alternate text of a tag that has an <code>alt</code> attribute is included in the output.
327 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
328 * @see #getIncludeAlternateText()
329 */
330 public Renderer setIncludeAlternateText(final boolean includeAlternateText) {
331 this.includeAlternateText=includeAlternateText;
332 return this;
333 }
334
335 /**
336 * Indicates whether the alternate text of a tag that has an <code>alt</code> attribute is included in the output.
337 * <p>
338 * See the {@link #setIncludeAlternateText(boolean)} method for a full description of this property.
339 *
340 * @return <code>true</code> if the alternate text of a tag that has an <code>alt</code> attribute is included in the output, otherwise <code>false</code>.
341 */
342 public boolean getIncludeAlternateText() {
343 return includeAlternateText;
344 }
345
346 /**
347 * Renders the alternate text of the specified start tag.
348 * <p>
349 * A return value of <code>null</code> indicates that the alternate text is not to be rendered at all.
350 * <p>
351 * The default implementation of this method returns <code>null</code> if the <code>alt</code> attribute of the specified start tag is missing or empty, or if the
352 * specified start tag is from an {@link HTMLElementName#AREA AREA} element.
353 * In all other cases it returns the value of the <code>alt</code> attribute enclosed in square brackets <code>[&hellip;]</code>.
354 * <p>
355 * See the documentation of the {@link #setIncludeAlternateText(boolean)} method for an example of how alternate text is rendered by the default implementation.
356 * <p>
357 * This method can be overridden in a subclass to customise the rendering of alternate text.
358 * <p>
359 * Rendering of alternate text can be disabled completely without overriding this method by setting the
360 * {@link #setIncludeAlternateText(boolean) IncludeAlternateText} property to <code>false</code>.
361 * <p>
362 * <dl>
363 * <dt>Example:</dt>
364 * <dd>
365 * To render alternate text with double angle quotation marks instead of square brackets:<br /><br />
366 * <code>
367 * Renderer renderer=new Renderer(segment) {<br />
368 * &nbsp; &nbsp; public String renderAlternateText(StartTag startTag) {<br />
369 * &nbsp; &nbsp; &nbsp; &nbsp; if (startTag.getName()==HTMLElementName.AREA) return null;<br />
370 * &nbsp; &nbsp; &nbsp; &nbsp; String alt=startTag.getAttributeValue("alt");<br />
371 * &nbsp; &nbsp; &nbsp; &nbsp; if (alt==null || alt.length()==0) return null;<br />
372 * &nbsp; &nbsp; &nbsp; &nbsp; return '«'+alt+'»';<br />
373 * &nbsp; &nbsp; }<br />
374 * };<br />
375 * String renderedSegment=renderer.toString();
376 * </code>
377 * </dd>
378 * </dl>
379 * @param startTag the start tag containing an <code>alt</code> attribute, must not be <code>null</code>.
380 * @return The rendered alternate text, or <code>null</code> if the alternate text should not be rendered.
381 */
382 public String renderAlternateText(final StartTag startTag) {
383 if (startTag.getName()==HTMLElementName.AREA) return null;
384 final String alt=startTag.getAttributeValue("alt");
385 if (alt==null || alt.length()==0) return null;
386 return '['+alt+']';
387 }
388
389 /**
390 * Sets whether decoration characters are to be included around the content of some
391 * <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
392 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
393 * <p>
394 * The default value is <code>false</code>.
395 * <p>
396 * Below is a table summarising the decorated elements.
397 * <p>
398 * <style type="text/css">
399 * table#FontStyleElementSummary td, table#FontStyleElementSummary th {text-align: center; padding-bottom: 2px}
400 * </style>
401 * <table id="FontStyleElementSummary" class="bordered" cellspacing="0">
402 * <tr><th title="HTML elements decorated">Elements</th><th title="The character placed around the element content">Character</th><th>Example Output</th></tr>
403 * <tr><td>{@link HTMLElementName#B B} and {@link HTMLElementName#STRONG STRONG}</td><td><code>*</code></td><td><code>*bold text*</code></td></tr>
404 * <tr><td>{@link HTMLElementName#I I} and {@link HTMLElementName#EM EM}</td><td><code>/</code></td><td><code>/italic text/</code></td></tr>
405 * <tr><td>{@link HTMLElementName#U U}</td><td><code>_</code></td><td><code>_underlined text_</code></td></tr>
406 * <tr><td>{@link HTMLElementName#CODE CODE}</td><td><code>|</code></td><td><code>|code|</code></td></tr>
407 * </table>
408 *
409 * @param decorateFontStyles specifies whether decoration characters are to be included around the content of some font style elements.
410 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
411 * @see #getDecorateFontStyles()
412 */
413 public Renderer setDecorateFontStyles(final boolean decorateFontStyles) {
414 this.decorateFontStyles=decorateFontStyles;
415 return this;
416 }
417
418 /**
419 * Indicates whether decoration characters are to be included around the content of some
420 * <a target="_blank" href="http://www.w3.org/TR/html401/present/graphics.html#h-15.2.1">font style elements</a> and
421 * <a target="_blank" href="http://www.w3.org/TR/html401/struct/text.html#h-9.2.1">phrase elements</a>.
422 * <p>
423 * See the {@link #setDecorateFontStyles(boolean)} method for a full description of this property.
424 *
425 * @return <code>true</code> if decoration characters are to be included around the content of some font style elements, otherwise <code>false</code>.
426 */
427 public boolean getDecorateFontStyles() {
428 return decorateFontStyles;
429 }
430
431 /**
432 * Sets whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
433 * <p>
434 * The default value is that of the static {@link Config#ConvertNonBreakingSpaces} property at the time the <code>Renderer</code> is instantiated.
435 *
436 * @param convertNonBreakingSpaces specifies whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
437 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
438 * @see #getConvertNonBreakingSpaces()
439 */
440 public Renderer setConvertNonBreakingSpaces(boolean convertNonBreakingSpaces) {
441 this.convertNonBreakingSpaces=convertNonBreakingSpaces;
442 return this;
443 }
444
445 /**
446 * Indicates whether non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces.
447 * <p>
448 * See the {@link #setConvertNonBreakingSpaces(boolean)} method for a full description of this property.
449 *
450 * @return <code>true</code> if non-breaking space ({@link CharacterEntityReference#_nbsp &amp;nbsp;}) character entity references are converted to spaces, otherwise <code>false</code>.
451 */
452 public boolean getConvertNonBreakingSpaces() {
453 return convertNonBreakingSpaces;
454 }
455
456 /**
457 * Sets the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
458 * <p>
459 * At present this applies to {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE} and {@link HTMLElementName#DD DD} elements.
460 * <p>
461 * The default value is <code>4</code>.
462 *
463 * @param blockIndentSize the size of the indent.
464 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
465 * @see #getBlockIndentSize()
466 */
467 public Renderer setBlockIndentSize(final int blockIndentSize) {
468 this.blockIndentSize=blockIndentSize;
469 return this;
470 }
471
472 /**
473 * Returns the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
474 * <p>
475 * See the {@link #setBlockIndentSize(int)} method for a full description of this property.
476 *
477 * @return the size of the indent to be used for anything other than {@link HTMLElementName#LI LI} elements.
478 */
479 public int getBlockIndentSize() {
480 return blockIndentSize;
481 }
482
483 /**
484 * Sets the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
485 * <p>
486 * The default value is <code>6</code>.
487 * <p>
488 * This applies to {@link HTMLElementName#LI LI} elements inside both {@link HTMLElementName#UL UL} and {@link HTMLElementName#OL OL} elements.
489 * <p>
490 * The bullet or number of the list item is included as part of the indent.
491 *
492 * @param listIndentSize the size of the indent.
493 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
494 * @see #getListIndentSize()
495 */
496 public Renderer setListIndentSize(final int listIndentSize) {
497 this.listIndentSize=listIndentSize;
498 return this;
499 }
500
501 /**
502 * Returns the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
503 * <p>
504 * See the {@link #setListIndentSize(int)} method for a full description of this property.
505 *
506 * @return the size of the indent to be used for {@link HTMLElementName#LI LI} elements.
507 */
508 public int getListIndentSize() {
509 return listIndentSize;
510 }
511
512 /**
513 * Sets the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
514 * <p>
515 * The values in the default array are <code>*</code>, <code>o</code>, <code>+</code> and <code>#</code>.
516 * <p>
517 * If the nesting of rendered lists goes deeper than the length of this array, the bullet characters start repeating from the first in the array.
518 * <p>
519 * WARNING: If any of the characters in the default array are modified, this will affect all other instances of this class using the default array.
520 *
521 * @param listBullets an array of characters to be used as bullets, must have at least one entry.
522 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
523 * @see #getListBullets()
524 */
525 public Renderer setListBullets(final char[] listBullets) {
526 if (listBullets==null || listBullets.length==0) throw new IllegalArgumentException("listBullets argument must be an array of at least one character");
527 this.listBullets=listBullets;
528 return this;
529 }
530
531 /**
532 * Returns the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
533 * <p>
534 * See the {@link #setListBullets(char[])} method for a full description of this property.
535 *
536 * @return the bullet characters to use for list items inside {@link HTMLElementName#UL UL} elements.
537 */
538 public char[] getListBullets() {
539 return listBullets;
540 }
541
542 /**
543 * Sets whether the top margin of the first element is rendered.
544 * <p>
545 * The default value is <code>false</code>.
546 * <p>
547 * If this property is set to <code>true</code>, then the source "<code>&lt;h1&gt;Heading&lt;/h1&gt;</code>" would be rendered as "<code>\r\n\r\nHeading</code>",
548 * assuming all other default settings.
549 * If this property is <code>false</code>, then the same source would be rendered as "<code>Heading</code>".
550 * <p>
551 * Note that the bottom margin of the last element is never rendered.
552 *
553 * @param includeFirstElementTopMargin specifies whether the top margin of the first element is rendered.
554 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
555 * @see #getIncludeFirstElementTopMargin()
556 */
557 public Renderer setIncludeFirstElementTopMargin(final boolean includeFirstElementTopMargin) {
558 this.includeFirstElementTopMargin=includeFirstElementTopMargin;
559 return this;
560 }
561
562 /**
563 * Indicates whether the top margin of the first element is rendered.
564 * <p>
565 * See the {@link #setIncludeFirstElementTopMargin(boolean)} method for a full description of this property.
566 *
567 * @return <code>true</code> if the top margin of the first element is rendered, otherwise <code>false</code>.
568 */
569 public boolean getIncludeFirstElementTopMargin() {
570 return includeFirstElementTopMargin;
571 }
572
573 /**
574 * Sets the string that is to separate table cells.
575 * <p>
576 * The default value is <code>" \t"</code> (a space followed by a tab).
577 *
578 * @param tableCellSeparator the string that is to separate table cells.
579 * @return this <code>Renderer</code> instance, allowing multiple property setting methods to be chained in a single statement.
580 * @see #getTableCellSeparator()
581 */
582 public Renderer setTableCellSeparator(final String tableCellSeparator) {
583 this.tableCellSeparator=tableCellSeparator;
584 return this;
585 }
586
587 /**
588 * Returns the string that is to separate table cells.
589 * <p>
590 * See the {@link #setTableCellSeparator(String)} method for a full description of this property.
591 *
592 * @return the string that is to separate table cells.
593 */
594 public String getTableCellSeparator() {
595 return tableCellSeparator;
596 }
597
598 /**
599 * Sets the default top margin of an HTML block element with the specified name.
600 * <p>
601 * The top margin is the number of blank lines that are to be inserted above the rendered block.
602 * <p>
603 * As this is a static method, the setting affects all instances of the <code>Renderer</code> class.
604 * <p>
605 * The <code>htmlElementName</code> argument must be one of the following:<br />
606 * {@link HTMLElementName#ADDRESS ADDRESS},
607 * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE},
608 * {@link HTMLElementName#CAPTION CAPTION},
609 * {@link HTMLElementName#CENTER CENTER},
610 * {@link HTMLElementName#DD DD},
611 * {@link HTMLElementName#DIR DIR},
612 * {@link HTMLElementName#DIV DIV},
613 * {@link HTMLElementName#DT DT},
614 * {@link HTMLElementName#FIELDSET FIELDSET},
615 * {@link HTMLElementName#FORM FORM},
616 * {@link HTMLElementName#H1 H1},
617 * {@link HTMLElementName#H2 H2},
618 * {@link HTMLElementName#H3 H3},
619 * {@link HTMLElementName#H4 H4},
620 * {@link HTMLElementName#H5 H5},
621 * {@link HTMLElementName#H6 H6},
622 * {@link HTMLElementName#HR HR},
623 * {@link HTMLElementName#LEGEND LEGEND},
624 * {@link HTMLElementName#LI LI},
625 * {@link HTMLElementName#MENU MENU},
626 * {@link HTMLElementName#OL OL},
627 * {@link HTMLElementName#P P},
628 * {@link HTMLElementName#PRE PRE},
629 * {@link HTMLElementName#TR TR},
630 * {@link HTMLElementName#UL UL}
631 *
632 * @param htmlElementName (required) the case insensitive name of a supported HTML block element.
633 * @param topMargin the new top margin of the specified element.
634 * @throws UnsupportedOperationException if an unsupported element name is specified.
635 */
636 public static void setDefaultTopMargin(String htmlElementName, final int topMargin) {
637 htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
638 ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newTopMargin(topMargin));
639 }
640
641 /**
642 * Returns the default top margin of an HTML block element with the specified name.
643 * <p>
644 * See the {@link #setDefaultTopMargin(String htmlElementName, int topMargin)} method for a full description of this property.
645 *
646 * @param htmlElementName (required) the case insensitive name of a supported HTML block element.
647 * @return the default top margin of an HTML block element with the specified name.
648 * @throws UnsupportedOperationException if an unsupported element name is specified.
649 */
650 public static int getDefaultTopMargin(final String htmlElementName) {
651 return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getTopMargin();
652 }
653
654 /**
655 * Sets the default bottom margin of an HTML block element with the specified name.
656 * <p>
657 * The bottom margin is the number of blank lines that are to be inserted below the rendered block.
658 * <p>
659 * As this is a static method, the setting affects all instances of the <code>Renderer</code> class.
660 * <p>
661 * The <code>htmlElementName</code> argument must be one of the following:<br />
662 * {@link HTMLElementName#ADDRESS ADDRESS},
663 * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE},
664 * {@link HTMLElementName#CAPTION CAPTION},
665 * {@link HTMLElementName#CENTER CENTER},
666 * {@link HTMLElementName#DD DD},
667 * {@link HTMLElementName#DIR DIR},
668 * {@link HTMLElementName#DIV DIV},
669 * {@link HTMLElementName#DT DT},
670 * {@link HTMLElementName#FIELDSET FIELDSET},
671 * {@link HTMLElementName#FORM FORM},
672 * {@link HTMLElementName#H1 H1},
673 * {@link HTMLElementName#H2 H2},
674 * {@link HTMLElementName#H3 H3},
675 * {@link HTMLElementName#H4 H4},
676 * {@link HTMLElementName#H5 H5},
677 * {@link HTMLElementName#H6 H6},
678 * {@link HTMLElementName#HR HR},
679 * {@link HTMLElementName#LEGEND LEGEND},
680 * {@link HTMLElementName#LI LI},
681 * {@link HTMLElementName#MENU MENU},
682 * {@link HTMLElementName#OL OL},
683 * {@link HTMLElementName#P P},
684 * {@link HTMLElementName#PRE PRE},
685 * {@link HTMLElementName#TR TR},
686 * {@link HTMLElementName#UL UL}
687 *
688 * @param htmlElementName (required) the case insensitive name of a supported HTML block element.
689 * @param bottomMargin the new bottom margin of the specified element.
690 * @throws UnsupportedOperationException if an unsupported element name is specified.
691 */
692 public static void setDefaultBottomMargin(String htmlElementName, final int bottomMargin) {
693 htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
694 ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newBottomMargin(bottomMargin));
695 }
696
697 /**
698 * Returns the default bottom margin of an HTML block element with the specified name.
699 * <p>
700 * See the {@link #setDefaultBottomMargin(String htmlElementName, int bottomMargin)} method for a full description of this property.
701 *
702 * @param htmlElementName (required) the case insensitive name of a supported HTML block element.
703 * @return the default bottom margin of an HTML block element with the specified name.
704 * @throws UnsupportedOperationException if an unsupported element name is specified.
705 */
706 public static int getDefaultBottomMargin(final String htmlElementName) {
707 return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).getBottomMargin();
708 }
709
710 /**
711 * Sets the default value of whether an HTML block element of the specified name is indented.
712 * <p>
713 * As this is a static method, the setting affects all instances of the <code>Renderer</code> class.
714 * <p>
715 * The <code>htmlElementName</code> argument must be one of the following:<br />
716 * {@link HTMLElementName#ADDRESS ADDRESS},
717 * {@link HTMLElementName#BLOCKQUOTE BLOCKQUOTE},
718 * {@link HTMLElementName#CAPTION CAPTION},
719 * {@link HTMLElementName#CENTER CENTER},
720 * {@link HTMLElementName#DD DD},
721 * {@link HTMLElementName#DIR DIR},
722 * {@link HTMLElementName#DIV DIV},
723 * {@link HTMLElementName#DT DT},
724 * {@link HTMLElementName#FIELDSET FIELDSET},
725 * {@link HTMLElementName#FORM FORM},
726 * {@link HTMLElementName#H1 H1},
727 * {@link HTMLElementName#H2 H2},
728 * {@link HTMLElementName#H3 H3},
729 * {@link HTMLElementName#H4 H4},
730 * {@link HTMLElementName#H5 H5},
731 * {@link HTMLElementName#H6 H6},
732 * {@link HTMLElementName#HR HR},
733 * {@link HTMLElementName#LEGEND LEGEND},
734 * {@link HTMLElementName#MENU MENU},
735 * {@link HTMLElementName#OL OL},
736 * {@link HTMLElementName#P P},
737 * {@link HTMLElementName#PRE PRE},
738 * {@link HTMLElementName#TR TR},
739 * {@link HTMLElementName#UL UL}
740 *
741 * @param htmlElementName (required) the case insensitive name of a supported HTML block element.
742 * @param indent whether the the specified element is indented.
743 * @throws UnsupportedOperationException if an unsupported element name is specified.
744 */
745 public static void setDefaultIndent(String htmlElementName, final boolean indent) {
746 htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
747 if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException();
748 ELEMENT_HANDLERS.put(htmlElementName,getAbstractBlockElementHandler(htmlElementName).newIndent(indent));
749 }
750
751 /**
752 * Returns the default value of whether an HTML block element of the specified name is indented.
753 * <p>
754 * See the {@link #setDefaultIndent(String htmlElementName, boolean indent)} method for a full description of this property.
755 *
756 * @param htmlElementName (required) the case insensitive name of a supported HTML block element.
757 * @return the default value of whether an HTML block element of the specified name is indented.
758 * @throws UnsupportedOperationException if an unsupported element name is specified.
759 */
760 public static boolean isDefaultIndent(String htmlElementName) {
761 htmlElementName=HTMLElements.getConstantElementName(htmlElementName.toLowerCase());
762 if (htmlElementName==HTMLElementName.LI) throw new UnsupportedOperationException();
763 return getAbstractBlockElementHandler(htmlElementName.toLowerCase()).isIndent();
764 }
765
766 private static AbstractBlockElementHandler getAbstractBlockElementHandler(String htmlElementName) {
767 ElementHandler elementHandler=ELEMENT_HANDLERS.get(htmlElementName);
768 if (elementHandler==null || !(elementHandler instanceof AbstractBlockElementHandler)) throw new UnsupportedOperationException("Cannot set block properties on element "+htmlElementName);
769 return (AbstractBlockElementHandler)elementHandler;
770 }
771
772 /** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
773 private static final class Processor {
774 private final Renderer renderer;
775 private final Segment rootSegment;
776 private final Source source;
777 private final int maxLineLength;
778 private final String newLine;
779 private final boolean includeHyperlinkURLs;
780 private final boolean includeAlternateText;
781 private final boolean decorateFontStyles;
782 private final boolean convertNonBreakingSpaces;
783 private final int blockIndentSize;
784 private final int listIndentSize;
785 private final char[] listBullets;
786 private final String tableCellSeparator;
787
788 private Appendable appendable;
789 private int renderedIndex; // keeps track of where rendering is up to in case of overlapping elements
790 private boolean atStartOfLine;
791 private boolean skipInitialNewLines;
792 private int col;
793 private int listIndentLevel;
794 private int indentSize;
795 private int blockVerticalMargin; // minimum number of blank lines to output at the current block boundary, or NO_MARGIN (-1) if we are not currently at a block boundary.
796 private boolean preformatted;
797 private boolean lastCharWhiteSpace;
798 private final boolean ignoreInitialWhiteSpace=false; // can remove this at some stage once we're sure it won't be used.
799 private boolean bullet;
800 private int listBulletNumber;
801
802 private static final int NO_MARGIN=-1;
803
804 public Processor(final Renderer renderer, final Segment rootSegment, final int maxLineLength, final String newLine, final boolean includeHyperlinkURLs, final boolean includeAlternateText, final boolean decorateFontStyles, final boolean convertNonBreakingSpaces, final int blockIndentSize, final int listIndentSize, final char[] listBullets, final String tableCellSeparator) {
805 this.renderer=renderer;
806 this.rootSegment=rootSegment;
807 source=rootSegment.source;
808 this.maxLineLength=maxLineLength;
809 this.newLine=newLine;
810 this.includeHyperlinkURLs=includeHyperlinkURLs;
811 this.includeAlternateText=includeAlternateText;
812 this.decorateFontStyles=decorateFontStyles;
813 this.convertNonBreakingSpaces=convertNonBreakingSpaces;
814 this.blockIndentSize=blockIndentSize;
815 this.listIndentSize=listIndentSize;
816 this.listBullets=listBullets;
817 this.tableCellSeparator=tableCellSeparator;
818 }
819
820 public void appendTo(final Appendable appendable) throws IOException {
821 reset();
822 this.appendable=appendable;
823 appendSegmentProcessingChildElements(rootSegment.begin,rootSegment.end,rootSegment.getChildElements());
824 }
825
826 private void reset() {
827 renderedIndex=0;
828 atStartOfLine=true;
829 skipInitialNewLines=!renderer.includeFirstElementTopMargin;
830 col=0;
831 listIndentLevel=0;
832 indentSize=0;
833 blockVerticalMargin=NO_MARGIN;
834 preformatted=false;
835 lastCharWhiteSpace=false;
836 //ignoreInitialWhiteSpace=false;
837 bullet=false;
838 }
839
840 private void appendElementContent(final Element element) throws IOException {
841 final int contentEnd=element.getContentEnd();
842 if (element.isEmpty() || renderedIndex>=contentEnd) return;
843 final int contentBegin=element.getStartTag().end;
844 appendSegmentProcessingChildElements(Math.max(renderedIndex,contentBegin),contentEnd,element.getChildElements());
845 }
846
847 private void appendSegmentProcessingChildElements(final int begin, final int end, final List<Element> childElements) throws IOException {
848 int index=begin;
849 for (Element childElement : childElements) {
850 if (index>=childElement.end) continue;
851 if (index<childElement.begin) appendSegmentRemovingTags(index,childElement.begin);
852 getElementHandler(childElement).process(this,childElement);
853 index=Math.max(renderedIndex,childElement.end);
854 }
855 if (index<end) appendSegmentRemovingTags(index,end);
856 }
857
858 private static ElementHandler getElementHandler(final Element element) {
859 if (element.getStartTag().getStartTagType().isServerTag()) return RemoveElementHandler.INSTANCE; // hard-coded configuration does not include server tags in child element hierarchy, so this is normally not executed.
860 ElementHandler elementHandler=ELEMENT_HANDLERS.get(element.getName());
861 return (elementHandler!=null) ? elementHandler : StandardInlineElementHandler.INSTANCE;
862 }
863
864 private void appendSegmentRemovingTags(final int begin, final int end) throws IOException {
865 int index=begin;
866 while (true) {
867 Tag tag=source.getNextTag(index);
868 if (tag==null || tag.begin>=end) break;
869 appendSegment(index,tag.begin);
870 index=tag.end;
871 }
872 appendSegment(index,end);
873 }
874
875 private void appendSegment(int begin, final int end) throws IOException {
876 assert begin<=end;
877 if (begin<renderedIndex) begin=renderedIndex;
878 if (begin>=end) return;
879 try {
880 if (preformatted)
881 appendPreformattedSegment(begin,end);
882 else
883 appendNonPreformattedSegment(begin,end);
884 } finally {
885 if (renderedIndex<end) renderedIndex=end;
886 }
887 }
888
889 private void appendPreformattedSegment(final int begin, final int end) throws IOException {
890 assert begin<end;
891 assert begin>=renderedIndex;
892 if (isBlockBoundary()) appendBlockVerticalMargin();
893 final String text=CharacterReference.decode(source.subSequence(begin,end),false,convertNonBreakingSpaces);
894 for (int i=0; i<text.length(); i++) {
895 final char ch=text.charAt(i);
896 if (ch=='\n') {
897 newLine();
898 } else if (ch=='\r') {
899 newLine();
900 final int nextI=i+1;
901 if (nextI==text.length()) break;
902 if (text.charAt(nextI)=='\n') i++;
903 } else {
904 append(ch);
905 }
906 }
907 }
908
909 private void appendNonPreformattedSegment(final int begin, final int end) throws IOException {
910 assert begin<end;
911 assert begin>=renderedIndex;
912 final String text=CharacterReference.decodeCollapseWhiteSpace(source.subSequence(begin,end),convertNonBreakingSpaces);
913 if (text.length()==0) {
914 // collapsed text is zero length but original segment wasn't, meaning it consists purely of white space.
915 if (!ignoreInitialWhiteSpace) lastCharWhiteSpace=true;
916 return;
917 }
918 appendNonPreformattedText(text,Segment.isWhiteSpace(source.charAt(begin)),Segment.isWhiteSpace(source.charAt(end-1)));
919 }
920
921 private void appendText(final String text) throws IOException {
922 assert text.length()>0;
923 appendNonPreformattedText(text,Segment.isWhiteSpace(text.charAt(0)),Segment.isWhiteSpace(text.charAt(text.length()-1)));
924 }
925
926 private void appendNonPreformattedText(final String text, final boolean isWhiteSpaceAtStart, final boolean isWhiteSpaceAtEnd) throws IOException {
927 if (isBlockBoundary()) {
928 appendBlockVerticalMargin();
929 } else if (lastCharWhiteSpace || (isWhiteSpaceAtStart && !ignoreInitialWhiteSpace)) {
930 // output white space only if not on a block boundary
931 append(' ');
932 }
933 int textIndex=0;
934 int i=0;
935 lastCharWhiteSpace=false;
936 //ignoreInitialWhiteSpace=false;
937 while (true) {
938 for (; i<text.length(); i++) {
939 if (text.charAt(i)!=' ') continue; // search for end of word
940 // At end of word. To comply with RFC264 Format=Flowed protocol, need to make sure we don't wrap immediately before ">" or "From ".
941 if (i+1<text.length() && text.charAt(i+1)=='>') continue;
942 if (i+6<text.length() && text.startsWith("From ",i+1)) continue;
943 break; // OK to wrap here if necessary
944 }
945 if (col+i-textIndex+1>=maxLineLength) {
946 if (lastCharWhiteSpace && (listIndentLevel|indentSize)==0) append(' ');
947 startNewLine(0);
948 } else if (lastCharWhiteSpace) {
949 append(' ');
950 }
951 append(text,textIndex,i);
952 if (i==text.length()) break;
953 lastCharWhiteSpace=true;
954 textIndex=++i;
955 }
956 lastCharWhiteSpace=isWhiteSpaceAtEnd;
957 }
958
959 private boolean isBlockBoundary() {
960 return blockVerticalMargin!=NO_MARGIN;
961 }
962
963 private void appendBlockVerticalMargin() throws IOException {
964 assert blockVerticalMargin!=NO_MARGIN;
965 if (skipInitialNewLines) {
966 // at first text after <li> element or start of document
967 skipInitialNewLines=false;
968 final int indentCol=indentSize+listIndentLevel*listIndentSize;
969 if (col==indentCol) {
970 atStartOfLine=false; // no need to call appendIndent() from appendTextInit().
971 } else {
972 // there was an indenting block since the <li> or start of document
973 if (bullet || col>indentCol) {
974 // just start new line as normal if the last indenting block is another <li>, or if the current column is already past the required indent
975 startNewLine(0);
976 } else {
977 // just append spaces to get the column up to the required indent
978 while (indentCol>col) {
979 appendable.append(' ');
980 col++;
981 }
982 atStartOfLine=false; // make sure appendIndent() isn't called again from appendTextInit()
983 }
984 }
985 } else {
986 startNewLine(blockVerticalMargin);
987 }
988 blockVerticalMargin=NO_MARGIN;
989 }
990
991 private void blockBoundary(final int verticalMargin) throws IOException {
992 // Set a block boundary with the given vertical margin. The vertical margin is the minimum number of blank lines to output between the blocks.
993 // This method can be called multiple times at a block boundary, and the next textual output will output the number of blank lines determined by the
994 // maximum vertical margin of all the method calls.
995 if (blockVerticalMargin<verticalMargin) blockVerticalMargin=verticalMargin;
996 }
997
998 private void startNewLine(int verticalMargin) throws IOException {
999 // ensures we end up at the start of a line with the specified vertical margin between the previous textual output and the next textual output.
1000 final int requiredNewLines=verticalMargin+(atStartOfLine?0:1);
1001 for (int i=0; i<requiredNewLines; i++) appendable.append(newLine);
1002 atStartOfLine=true;
1003 col=0;
1004 }
1005
1006 private void newLine() throws IOException {
1007 appendable.append(newLine);
1008 atStartOfLine=true;
1009 col=0;
1010 }
1011
1012 private void appendTextInit() throws IOException {
1013 skipInitialNewLines=false;
1014 if (atStartOfLine) appendIndent();
1015 }
1016
1017 private void appendIndent() throws IOException {
1018 for (int i=indentSize; i>0; i--) appendable.append(' ');
1019 if (bullet) {
1020 for (int i=(listIndentLevel-1)*listIndentSize; i>0; i--) appendable.append(' ');
1021 if (listBulletNumber==UNORDERED_LIST) {
1022 for (int i=listIndentSize-2; i>0; i--) appendable.append(' ');
1023 appendable.append(listBullets[(listIndentLevel-1)%listBullets.length]).append(' ');
1024 } else {
1025 String bulletNumberString=Integer.toString(listBulletNumber);
1026 for (int i=listIndentSize-bulletNumberString.length()-2; i>0; i--) appendable.append(' ');
1027 appendable.append(bulletNumberString).append(". ");
1028 }
1029 bullet=false;
1030 } else {
1031 for (int i=listIndentLevel*listIndentSize; i>0; i--) appendable.append(' ');
1032 }
1033 col=indentSize+listIndentLevel*listIndentSize;
1034 atStartOfLine=false;
1035 }
1036
1037 private Processor append(final char ch) throws IOException {
1038 appendTextInit();
1039 appendable.append(ch);
1040 col++;
1041 return this;
1042 }
1043
1044 private Processor append(final String text) throws IOException {
1045 appendTextInit();
1046 appendable.append(text);
1047 col+=text.length();
1048 return this;
1049 }
1050
1051 private void append(final CharSequence text, final int begin, final int end) throws IOException {
1052 appendTextInit();
1053 for (int i=begin; i<end; i++) appendable.append(text.charAt(i));
1054 col+=end-begin;
1055 }
1056 }
1057
1058 private interface ElementHandler {
1059 void process(Processor x, Element element) throws IOException;
1060 }
1061
1062 private static final class RemoveElementHandler implements ElementHandler {
1063 public static final ElementHandler INSTANCE=new RemoveElementHandler();
1064 public void process(Processor x, Element element) {}
1065 }
1066
1067 private static final class StandardInlineElementHandler implements ElementHandler {
1068 public static final ElementHandler INSTANCE=new StandardInlineElementHandler();
1069 public void process(Processor x, Element element) throws IOException {
1070 x.appendElementContent(element);
1071 }
1072 }
1073
1074 private static final class FontStyleElementHandler implements ElementHandler {
1075 public static final ElementHandler INSTANCE_B=new FontStyleElementHandler('*');
1076 public static final ElementHandler INSTANCE_I=new FontStyleElementHandler('/');
1077 public static final ElementHandler INSTANCE_U=new FontStyleElementHandler('_');
1078 public static final ElementHandler INSTANCE_CODE=new FontStyleElementHandler('|');
1079 private final char decorationChar;
1080 public FontStyleElementHandler(char decorationChar) {
1081 this.decorationChar=decorationChar;
1082 }
1083 public void process(Processor x, Element element) throws IOException {
1084 if (x.decorateFontStyles) {
1085 if (x.lastCharWhiteSpace) {
1086 x.append(' ');
1087 x.lastCharWhiteSpace=false;
1088 }
1089 x.append(decorationChar);
1090 x.appendElementContent(element);
1091 if (x.decorateFontStyles) x.append(decorationChar);
1092 } else {
1093 x.appendElementContent(element);
1094 }
1095 }
1096 }
1097
1098 abstract private static class AbstractBlockElementHandler implements ElementHandler {
1099 private final int topMargin;
1100 private final int bottomMargin;
1101 private final boolean indent;
1102 protected AbstractBlockElementHandler(int topMargin, int bottomMargin, boolean indent) {
1103 this.topMargin=topMargin;
1104 this.bottomMargin=bottomMargin;
1105 this.indent=indent;
1106 }
1107 public void process(Processor x, Element element) throws IOException {
1108 x.blockBoundary(RendererCSS.getTopMargin(element,topMargin));
1109 int leftMargin=RendererCSS.getLeftMargin(element,indent ? x.blockIndentSize : 0);
1110 x.indentSize+=leftMargin;
1111 processBlockContent(x,element);
1112 x.indentSize-=leftMargin;
1113 x.blockBoundary(RendererCSS.getBottomMargin(element,bottomMargin));
1114 }
1115 public AbstractBlockElementHandler newTopMargin(int topMargin) {
1116 return newInstance(topMargin,this.bottomMargin,this.indent);
1117 }
1118 public int getTopMargin() {
1119 return topMargin;
1120 }
1121 public AbstractBlockElementHandler newBottomMargin(int bottomMargin) {
1122 return newInstance(this.topMargin,bottomMargin,this.indent);
1123 }
1124 public int getBottomMargin() {
1125 return bottomMargin;
1126 }
1127 public AbstractBlockElementHandler newIndent(boolean indent) {
1128 return newInstance(this.topMargin,this.bottomMargin,indent);
1129 }
1130 public boolean isIndent() {
1131 return indent;
1132 }
1133 abstract protected void processBlockContent(Processor x, Element element) throws IOException;
1134 abstract protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent);
1135 }
1136
1137 private static final class StandardBlockElementHandler extends AbstractBlockElementHandler {
1138 public static final ElementHandler INSTANCE_0_0=new StandardBlockElementHandler(0,0,false);
1139 public static final ElementHandler INSTANCE_1_1=new StandardBlockElementHandler(1,1,false);
1140 public static final ElementHandler INSTANCE_2_1=new StandardBlockElementHandler(2,1,false);
1141 public static final ElementHandler INSTANCE_0_0_INDENT=new StandardBlockElementHandler(0,0,true);
1142 public static final ElementHandler INSTANCE_1_1_INDENT=new StandardBlockElementHandler(1,1,true);
1143 private StandardBlockElementHandler(int topMargin, int bottomMargin, boolean indent) {
1144 super(topMargin,bottomMargin,indent);
1145 }
1146 protected void processBlockContent(Processor x, Element element) throws IOException {
1147 x.appendElementContent(element);
1148 }
1149 protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent) {
1150 return new StandardBlockElementHandler(topMargin,bottomMargin,indent);
1151 }
1152 }
1153
1154 private static final class A_ElementHandler implements ElementHandler {
1155 public static final ElementHandler INSTANCE=new A_ElementHandler();
1156 public void process(Processor x, Element element) throws IOException {
1157 x.appendElementContent(element);
1158 if (!x.includeHyperlinkURLs) return;
1159 String renderedHyperlinkURL=x.renderer.renderHyperlinkURL(element.getStartTag());
1160 if (renderedHyperlinkURL==null) return;
1161 int linkLength=renderedHyperlinkURL.length()+1;
1162 if (x.col+linkLength>=x.maxLineLength) {
1163 x.startNewLine(0);
1164 } else {
1165 x.append(' ');
1166 }
1167 x.append(renderedHyperlinkURL);
1168 x.lastCharWhiteSpace=true;
1169 }
1170 }
1171
1172 private static final class BR_ElementHandler implements ElementHandler {
1173 public static final ElementHandler INSTANCE=new BR_ElementHandler();
1174 public void process(Processor x, Element element) throws IOException {
1175 if (x.isBlockBoundary() && !x.atStartOfLine && !x.skipInitialNewLines) x.newLine(); // add an extra new line if we're at a block boundary and aren't already at the start of the next line and it's not the first element after <li>
1176 x.newLine();
1177 x.blockBoundary(0);
1178 }
1179 }
1180
1181 private static final class HR_ElementHandler extends AbstractBlockElementHandler {
1182 public static final ElementHandler INSTANCE=new HR_ElementHandler();
1183 private HR_ElementHandler() {
1184 this(0,0,false);
1185 }
1186 private HR_ElementHandler(int topMargin, int bottomMargin, boolean indent) {
1187 super(topMargin,bottomMargin,indent);
1188 }
1189 protected void processBlockContent(Processor x, Element element) throws IOException {
1190 x.appendBlockVerticalMargin();
1191 final int maxCol=x.maxLineLength-4;
1192 x.append('-');
1193 for (int i=x.col; i<maxCol; i++) x.appendable.append('-');
1194 x.col=maxCol;
1195 }
1196 protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent) {
1197 return new HR_ElementHandler(topMargin,bottomMargin,indent);
1198 }
1199 }
1200
1201 private static final class AlternateTextElementHandler implements ElementHandler {
1202 public static final ElementHandler INSTANCE=new AlternateTextElementHandler();
1203 public void process(Processor x, Element element) throws IOException {
1204 if (!x.includeAlternateText) return;
1205 String text=x.renderer.renderAlternateText(element.getStartTag());
1206 if (text==null) return;
1207 x.appendText(text);
1208 }
1209 }
1210
1211 private static final class ListElementHandler extends AbstractBlockElementHandler {
1212 public static final ElementHandler INSTANCE_OL=new ListElementHandler(0);
1213 public static final ElementHandler INSTANCE_UL=new ListElementHandler(UNORDERED_LIST);
1214 private final int initialListBulletNumber;
1215 private ListElementHandler(int initialListBulletNumber) {
1216 this(initialListBulletNumber,0,0,false);
1217 }
1218 private ListElementHandler(int initialListBulletNumber, int topMargin, int bottomMargin, boolean indent) {
1219 super(topMargin,bottomMargin,indent);
1220 this.initialListBulletNumber=initialListBulletNumber;
1221 }
1222 protected void processBlockContent(Processor x, Element element) throws IOException {
1223 int oldListBulletNumber=x.listBulletNumber;
1224 x.listBulletNumber=initialListBulletNumber;
1225 x.listIndentLevel++;
1226 x.appendElementContent(element);
1227 x.listIndentLevel--;
1228 x.listBulletNumber=oldListBulletNumber;
1229 }
1230 protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent) {
1231 return new ListElementHandler(initialListBulletNumber,topMargin,bottomMargin,indent);
1232 }
1233 }
1234
1235 private static final class LI_ElementHandler extends AbstractBlockElementHandler {
1236 public static final ElementHandler INSTANCE=new LI_ElementHandler();
1237 private LI_ElementHandler() {
1238 this(0,0,false);
1239 }
1240 private LI_ElementHandler(int topMargin, int bottomMargin, boolean indent) {
1241 super(topMargin,bottomMargin,indent);
1242 }
1243 protected void processBlockContent(Processor x, Element element) throws IOException {
1244 if (x.listBulletNumber!=UNORDERED_LIST) x.listBulletNumber++;
1245 x.bullet=true;
1246 x.appendBlockVerticalMargin();
1247 x.appendIndent();
1248 x.skipInitialNewLines=true;
1249 x.blockBoundary(0); // this shouldn't result in the output of any new lines but ensures surrounding white space is ignored
1250 x.appendElementContent(element);
1251 x.bullet=false;
1252 }
1253 protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent) {
1254 return new LI_ElementHandler(topMargin,bottomMargin,indent);
1255 }
1256 }
1257
1258 private static final class PRE_ElementHandler extends AbstractBlockElementHandler {
1259 public static final ElementHandler INSTANCE=new PRE_ElementHandler();
1260 private PRE_ElementHandler() {
1261 this(1,1,false);
1262 }
1263 private PRE_ElementHandler(int topMargin, int bottomMargin, boolean indent) {
1264 super(topMargin,bottomMargin,indent);
1265 }
1266 protected void processBlockContent(Processor x, Element element) throws IOException {
1267 boolean oldPreformatted=x.preformatted; // should always be false
1268 x.preformatted=true;
1269 x.appendElementContent(element);
1270 x.preformatted=oldPreformatted;
1271 }
1272 protected AbstractBlockElementHandler newInstance(int topMargin, int bottomMargin, boolean indent) {
1273 return new PRE_ElementHandler(topMargin,bottomMargin,indent);
1274 }
1275 }
1276
1277 private static final class TD_ElementHandler implements ElementHandler {
1278 public static final ElementHandler INSTANCE=new TD_ElementHandler();
1279 public void process(Processor x, Element element) throws IOException {
1280 if (!x.isBlockBoundary()) x.append(x.tableCellSeparator);
1281 x.lastCharWhiteSpace=false;
1282 x.appendElementContent(element);
1283 }
1284 }
1285
1286 }

   
Visit the aagtl Website