/[aagtl_public1]/src/net/htmlparser/jericho/SourceFormatter.java
aagtl

Contents of /src/net/htmlparser/jericho/SourceFormatter.java

Parent Directory Parent Directory | Revision Log Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 33533 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.*;
24 import java.io.*;
25 import java.net.*;
26
27 /**
28 * Formats HTML source by laying out each non-inline-level element on a new line with an appropriate indent.
29 * <p>
30 * Any indentation present in the original source text is removed.
31 * <p>
32 * Use one of the following methods to obtain the output:
33 * <ul>
34 * <li>{@link #writeTo(Writer)}</li>
35 * <li>{@link #appendTo(Appendable)}</li>
36 * <li>{@link #toString()}</li>
37 * <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
38 * </ul>
39 * <p>
40 * The output text is functionally equivalent to the original source and should be rendered identically unless specified below.
41 * <p>
42 * The following points describe the process in general terms.
43 * Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
44 * <p>
45 * <ul>
46 * <li>Every element that is not an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level element} appears on a new line
47 * with an indent corresponding to its {@linkplain Element#getDepth() depth} in the <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>.
48 * <li>The indent is formed by writing <i>n</i> repetitions of the string specified in the {@link #setIndentString(String) IndentString} property,
49 * where <i>n</i> is the depth of the indentation.
50 * <li>The {@linkplain Element#getContent() content} of an indented element starts on a new line and is indented at a depth one greater than that of the element,
51 * with the end tag appearing on a new line at the same depth as the start tag.
52 * If the content contains only text and {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements},
53 * it may continue on the same line as the start tag. Additionally, if the output content contains no new lines, the end tag may also continue on the same line.
54 * <li>The content of preformatted elements such as {@link HTMLElementName#PRE PRE} and {@link HTMLElementName#TEXTAREA TEXTAREA} is not indented,
55 * nor is the white space modified in any way.
56 * <li>Only {@linkplain StartTagType#NORMAL normal} and {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} elements are indented.
57 * All others are treated as {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
58 * <li>White space and indentation inside HTML {@linkplain StartTagType#COMMENT comments}, {@linkplain StartTagType#CDATA_SECTION CDATA sections}, or any
59 * {@linkplain TagType#isServerTag() server tag} is preserved,
60 * but with the indentation of new lines starting at a depth one greater than that of the surrounding text.
61 * <li>White space and indentation inside {@link HTMLElementName#SCRIPT SCRIPT} elements is preserved,
62 * but with the indentation of new lines starting at a depth one greater than that of the <code>SCRIPT</code> element.
63 * <li>If the {@link #setTidyTags(boolean) TidyTags} property is set to <code>true</code>,
64 * every tag in the document is replaced with the output from its {@link Tag#tidy()} method.
65 * If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
66 * but with any new lines indented at a depth one greater than that of the element.
67 * <li>If the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} property
68 * is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
69 * located outside of a tag is replaced with a single space in the output.
70 * White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
71 * <li>If the {@link #setIndentAllElements(boolean) IndentAllElements} property
72 * is set to <code>true</code>, every element appears indented on a new line, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
73 * This generates output that is a good representation of the actual <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>,
74 * but is very likely to introduce white space that compromises the functional equivalency of the document.
75 * <li>The {@link #setNewLine(String) NewLine} property specifies the character sequence
76 * to use for each <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output document.
77 * <li>If the source document contains {@linkplain TagType#isServerTag() server tags}, the functional equivalency of the output document may be compromised.
78 * </ul>
79 * <p>
80 * Formatting an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
81 */
82 public final class SourceFormatter implements CharStreamSource {
83 private final Segment segment; // the segment whose HTML is formatted; fixed at construction
84 private String indentString="\t"; // written once per indent level; setIndentString rejects null
85 private boolean tidyTags=false; // forwarded to the Processor, which then writes Tag.tidy() output for ordinary tags
86 private boolean collapseWhiteSpace=false; // forwarded to the Processor, which also forces it on when removeLineBreaks is set
87 private boolean removeLineBreaks=false; // only settable through the package-private setRemoveLineBreaks
88 private boolean indentAllElements=false; // also handed to the Processor as its indentScriptElements argument (see appendTo)
89 private String newLine=null; // null means "not yet resolved"; getNewLine() fills it in from the source's best guess
90
91 /**
92 * Constructs a new <code>SourceFormatter</code> based on the specified {@link Segment}.
93 * @param segment the segment containing the HTML to be formatted.
94 * @see Source#getSourceFormatter()
95 */
public SourceFormatter(final Segment segment) {
	// Only the reference is stored here; the formatting work happens when output is requested via appendTo.
	this.segment=segment;
}
99
100 // Documentation inherited from CharStreamSource
101 public void writeTo(final Writer writer) throws IOException {
102 appendTo(writer);
103 writer.flush();
104 }
105
106 // Documentation inherited from CharStreamSource
107 public void appendTo(final Appendable appendable) throws IOException {
108 new Processor(segment,getIndentString(),getTidyTags(),getCollapseWhiteSpace(),getRemoveLineBreaks(),getIndentAllElements(),getIndentAllElements(),getNewLine()).appendTo(appendable);
109 }
110
111 // Documentation inherited from CharStreamSource
112 public long getEstimatedMaximumOutputLength() {
113 return segment.length()*2;
114 }
115
116 // Documentation inherited from CharStreamSource
117 public String toString() {
118 return CharStreamSourceUtil.toString(this);
119 }
120
121 /**
122 * Sets the string to be used for indentation.
123 * <p>
124 * The default value is a string containing a single tab character (U+0009).
125 * <p>
126 * The most commonly used indent strings are <code>"\t"</code> (single tab), <code>"&nbsp;"</code> (single space), <code>"&nbsp;&nbsp;"</code> (2 spaces), and <code>"&nbsp;&nbsp;&nbsp;&nbsp;"</code> (4 spaces).
127 *
128 * @param indentString the string to be used for indentation, must not be <code>null</code>.
129 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
130 * @see #getIndentString()
131 */
132 public SourceFormatter setIndentString(final String indentString) {
133 if (indentString==null) throw new IllegalArgumentException("indentString property must not be null");
134 this.indentString=indentString;
135 return this;
136 }
137
138 /**
139 * Returns the string to be used for indentation.
140 * <p>
141 * See the {@link #setIndentString(String)} method for a full description of this property.
142 *
143 * @return the string to be used for indentation.
144 */
145 public String getIndentString() {
146 return indentString;
147 }
148
149 /**
150 * Sets whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
151 * <p>
152 * The default value is <code>false</code>.
153 * <p>
154 * If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
155 * but with any new lines indented at a depth one greater than that of the element.
156 *
157 * @param tidyTags specifies whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
158 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
159 * @see #getTidyTags()
160 */
161 public SourceFormatter setTidyTags(final boolean tidyTags) {
162 this.tidyTags=tidyTags;
163 return this;
164 }
165
166 /**
167 * Indicates whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
168 * <p>
169 * See the {@link #setTidyTags(boolean)} method for a full description of this property.
170 *
171 * @return <code>true</code> if the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method, otherwise <code>false</code>.
172 */
173 public boolean getTidyTags() {
174 return tidyTags;
175 }
176
177 /**
178 * Sets whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
179 * <p>
180 * The default value is <code>false</code>.
181 * <p>
182 * If this property is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
183 * located outside of a tag is replaced with a single space in the output.
184 * White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
185 *
186 * @param collapseWhiteSpace specifies whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
187 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
188 * @see #getCollapseWhiteSpace()
189 */
190 public SourceFormatter setCollapseWhiteSpace(final boolean collapseWhiteSpace) {
191 this.collapseWhiteSpace=collapseWhiteSpace;
192 return this;
193 }
194
195 /**
196 * Indicates whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
197 * <p>
198 * See the {@link #setCollapseWhiteSpace(boolean collapseWhiteSpace)} method for a full description of this property.
199 *
200 * @return <code>true</code> if {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed, otherwise <code>false</code>.
201 */
202 public boolean getCollapseWhiteSpace() {
203 return collapseWhiteSpace;
204 }
205
206 /**
207 * Sets whether all non-essential line breaks are removed.
208 * <p>
209 * The default value is <code>false</code>.
210 * <p>
211 * If this property is set to <code>true</code>, only essential line breaks are retained in the output.
212 * <p>
213 * Setting this property automatically engages the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} option, regardless of its property setting.
214 * <p>
215 * It is recommended to set the {@link #setTidyTags(boolean) TidyTags} property when this option is used so that non-essential line breaks are also removed from tags.
216 *
217 * @param removeLineBreaks specifies whether all non-essential line breaks are removed.
218 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
219 * @see #getRemoveLineBreaks()
220 */
221 SourceFormatter setRemoveLineBreaks(final boolean removeLineBreaks) {
222 this.removeLineBreaks=removeLineBreaks;
223 return this;
224 }
225
226 /**
227 * Indicates whether all non-essential line breaks are removed.
228 * <p>
229 * See the {@link #setRemoveLineBreaks(boolean removeLineBreaks)} method for a full description of this property.
230 *
231 * @return <code>true</code> if all non-essential line breaks are removed, otherwise <code>false</code>.
232 */
233 boolean getRemoveLineBreaks() {
234 return removeLineBreaks;
235 }
236
237 /**
238 * Sets whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
239 * <p>
240 * The default value is <code>false</code>.
241 * <p>
242 * If this property is set to <code>true</code>, every element appears indented on a new line, including
243 * {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
244 * <p>
245 * This generates output that is a good representation of the actual <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>,
246 * but is very likely to introduce white space that compromises the functional equivalency of the document.
247 *
248 * @param indentAllElements specifies whether all elements are to be indented.
249 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
250 * @see #getIndentAllElements()
251 */
252 public SourceFormatter setIndentAllElements(final boolean indentAllElements) {
253 this.indentAllElements=indentAllElements;
254 return this;
255 }
256
257 /**
258 * Indicates whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
259 * <p>
260 * See the {@link #setIndentAllElements(boolean)} method for a full description of this property.
261 *
262 * @return <code>true</code> if all elements are to be indented, otherwise <code>false</code>.
263 */
264 public boolean getIndentAllElements() {
265 return indentAllElements;
266 }
267
268 /**
269 * Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
270 * <p>
271 * The default is to use the same new line string as is used in the source document, which is determined via the {@link Source#getNewLine()} method.
272 * If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
273 * or using the value from the static {@link Config#NewLine} property.
274 * <p>
275 * Specifying a <code>null</code> argument resets the property to its default value, which is to use the same new line string as is used in the source document.
276 *
277 * @param newLine the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
278 * @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
279 * @see #getNewLine()
280 */
281 public SourceFormatter setNewLine(final String newLine) {
282 this.newLine=newLine;
283 return this;
284 }
285
286 /**
287 * Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
288 * <p>
289 * See the {@link #setNewLine(String)} method for a full description of this property.
290 *
291 * @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
292 */
293 public String getNewLine() {
294 if (newLine==null) newLine=segment.source.getBestGuessNewLine();
295 return newLine;
296 }
297
298 /** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
299 private static final class Processor {
300 private final Segment segment; // the segment being formatted
301 private final CharSequence sourceText; // segment.source.toString(); all positions (index, begin, end) index into this text
302 private final String indentString; // appended once per depth level by appendIndent
303 private final boolean tidyTags; // when true, appendTag writes tag.tidy() for tags other than comments, CDATA sections and server tags
304 private final boolean collapseWhiteSpace; // set by the constructor to (collapseWhiteSpace || removeLineBreaks)
305 private final boolean removeLineBreaks; // Indicates whether all non-essential line breaks are removed. Must be used with collapseWhiteSpace=true.
306 private final boolean indentAllElements; // when true, appendContent writes every child element via appendElement, never inline
307 private final boolean indentScriptElements; // at present this parameter is tied to indentAllElements. SCRIPT elements need to be inline to keep functional equivalency of output
308 private final String newLine; // string appended for both formatting and essential new lines
309
310 private Appendable appendable; // output sink; assigned at the start of appendTo
311 private Tag nextTag; // cursor over the source's tags; updateNextTag advances it to the first tag beginning at or after index
312 private int index; // current read position in sourceText
313
public Processor(final Segment segment, final String indentString, final boolean tidyTags, final boolean collapseWhiteSpace, final boolean removeLineBreaks, final boolean indentAllElements, final boolean indentScriptElements, final String newLine) {
	// What is being formatted.
	this.segment=segment;
	this.sourceText=segment.source.toString();

	// How it is laid out.
	this.indentString=indentString;
	this.newLine=newLine;
	this.indentAllElements=indentAllElements;
	this.indentScriptElements=indentScriptElements;

	// How tags and text are rewritten. Removing line breaks implies collapsing white space.
	this.tidyTags=tidyTags;
	this.removeLineBreaks=removeLineBreaks;
	this.collapseWhiteSpace=removeLineBreaks || collapseWhiteSpace;
}
325
326 public void appendTo(final Appendable appendable) throws IOException {
327 this.appendable=appendable;
328 if (segment instanceof Source) ((Source)segment).fullSequentialParse();
329 nextTag=segment.source.getNextTag(segment.begin);
330 index=segment.begin;
331 appendContent(segment.end,segment.getChildElements(),0);
332 }
333
334 private void appendContent(final int end, final List<Element> childElements, final int depth) throws IOException {
335 assert index<=end;
336 for (Element element : childElements) {
337 final int elementBegin=element.begin;
338 if (elementBegin>=end) break;
339 if (indentAllElements) {
340 appendText(elementBegin,depth);
341 appendElement(element,depth,end,false,false);
342 } else {
343 if (inlinable(element)) continue; // skip over elements that can be inlined.
344 appendText(elementBegin,depth);
345 final String elementName=element.getName();
346 if (elementName==HTMLElementName.PRE || elementName==HTMLElementName.TEXTAREA) {
347 appendElement(element,depth,end,true,true);
348 } else if (elementName==HTMLElementName.SCRIPT) {
349 appendElement(element,depth,end,true,false);
350 } else {
351 appendElement(element,depth,end,false,!removeLineBreaks && containsOnlyInlineLevelChildElements(element));
352 }
353 }
354 }
355 appendText(end,depth);
356 assert index==end;
357 }
358
359 private boolean inlinable(final Element element) {
360 // returns true if the specified element should be inlined
361 final StartTagType startTagType=element.getStartTag().getStartTagType();
362 // if (startTagType==StartTagType.DOCTYPE_DECLARATION) return false; // this was removed because it caused an extra line break if the DOCTYPE is preceeded by a server tag
363 if (startTagType!=StartTagType.NORMAL) return true;
364 // element is a normal type
365 final String elementName=element.getName();
366 if (elementName==HTMLElementName.SCRIPT) return !indentScriptElements;
367 if (removeLineBreaks && !HTMLElements.getElementNames().contains(elementName)) return true; // inline non-HTML elements if removing line breaks
368 if (!HTMLElements.getInlineLevelElementNames().contains(elementName)) return false;
369 // element is inline type
370 if (removeLineBreaks) return true;
371 if (elementName==HTMLElementName.TEXTAREA) return false; // TEXTAREA is theoretically inlinable but we want to format its content in the same was as PRE, and this is easiest when the entire element is treated like a block PRE element.
372 return containsOnlyInlineLevelChildElements(element); // only inline if it doesn't illegally contain non-inline elements
373 }
374
375 private void appendText(final int end, int depth) throws IOException {
376 assert index<=end;
377 if (index==end) return;
378 while (Segment.isWhiteSpace(sourceText.charAt(index))) if (++index==end) return; // trim whitespace.
379 appendIndent(depth);
380 if (collapseWhiteSpace) {
381 appendTextCollapseWhiteSpace(end,depth);
382 } else {
383 appendTextInline(end,depth,false);
384 }
385 appendFormattingNewLine();
386 assert index==end;
387 }
388
389 private void appendElement(final Element element, final int depth, final int end, final boolean preformatted, boolean renderContentInline) throws IOException {
390 assert index==element.begin;
391 assert index<end;
392 final StartTag startTag=element.getStartTag();
393 final EndTag endTag=element.getEndTag();
394 appendIndent(depth);
395 appendTag(startTag,depth,end);
396 if (index==end) {
397 appendFormattingNewLine();
398 assert index==Math.min(element.end,end) : index;
399 return;
400 }
401 if (!renderContentInline) appendFormattingNewLine();
402 int contentEnd=element.getContentEnd();
403 if (end<contentEnd) contentEnd=end;
404 if (index<contentEnd) {
405 if (preformatted) {
406 if (renderContentInline) {
407 // Preformatted element such as PRE, TEXTAREA
408 appendContentPreformatted(contentEnd,depth);
409 } else {
410 // SCRIPT element
411 appendIndentedScriptContent(contentEnd,depth+1);
412 }
413 } else {
414 if (renderContentInline) {
415 // Element contains only inline-level elements, so don't bother putting start and end tags on separate lines
416 if (collapseWhiteSpace) {
417 appendTextCollapseWhiteSpace(contentEnd,depth);
418 } else {
419 if (!appendTextInline(contentEnd,depth,true)) {
420 appendFormattingNewLine();
421 renderContentInline=false;
422 }
423 }
424 } else {
425 appendContent(contentEnd,element.getChildElements(),depth+1);
426 }
427 }
428 }
429 if (endTag!=null && end>endTag.begin) {
430 if (!renderContentInline) appendIndent(depth);
431 assert index==endTag.begin;
432 appendTag(endTag,depth,end);
433 appendFormattingNewLine();
434 } else if (renderContentInline) {
435 appendFormattingNewLine();
436 }
437 assert index==Math.min(element.end,end) : index;
438 }
439
440 private void updateNextTag() {
441 // ensures that nextTag is up to date
442 while (nextTag!=null) {
443 if (nextTag.begin>=index) return;
444 nextTag=nextTag.getNextTag();
445 }
446 }
447
448 private void appendIndentedScriptContent(final int end, final int depth) throws IOException {
449 assert index<end;
450 if (removeLineBreaks) {
451 appendTextRemoveIndentation(end);
452 assert index==end;
453 return;
454 }
455 int startOfLinePos=getStartOfLinePos(end,false);
456 if (index==end) return;
457 if (startOfLinePos==-1) {
458 // Script started on same line as start tag. Use the start of the next line to determine the original indent.
459 appendIndent(depth);
460 appendLineKeepWhiteSpace(end,depth);
461 appendEssentialNewLine();
462 if (index==end) return;
463 startOfLinePos=getStartOfLinePos(end,true);
464 if (index==end) return;
465 }
466 appendTextPreserveIndentation(end,depth,index-startOfLinePos);
467 appendEssentialNewLine();
468 assert index==end;
469 }
470
471 private boolean appendTextPreserveIndentation(final int end, final int depth) throws IOException {
472 // returns true if all text was on one line, otherwise false
473 assert index<end;
474 if (removeLineBreaks) return appendTextRemoveIndentation(end);
475 // Use the start of the next line to determine the original indent.
476 appendLineKeepWhiteSpace(end,depth);
477 if (index==end) return true;
478 int startOfLinePos=getStartOfLinePos(end,true);
479 if (index==end) return true;
480 appendEssentialNewLine();
481 appendTextPreserveIndentation(end,depth+1,index-startOfLinePos);
482 assert index==end;
483 return false;
484 }
485
486 private void appendTextPreserveIndentation(final int end, final int depth, final int originalIndentLength) throws IOException {
487 assert index<end;
488 appendIndent(depth);
489 appendLineKeepWhiteSpace(end,depth);
490 while (index!=end) {
491 // Skip over the original indent:
492 for (int x=0; x<originalIndentLength; x++) {
493 final char ch=sourceText.charAt(index);
494 if (!(ch==' ' || ch=='\t')) break;
495 if (++index==end) return;
496 }
497 appendEssentialNewLine();
498 // Insert our indent:
499 appendIndent(depth);
500 // Write the rest of the line including any indent greater than the first line's indent:
501 appendLineKeepWhiteSpace(end,depth);
502 }
503 assert index==end;
504 }
505
506 private boolean appendTextRemoveIndentation(final int end) throws IOException {
507 assert index<end;
508 appendLineKeepWhiteSpace(end,0);
509 if (index==end) return true;
510 while (index!=end) {
511 // Skip over the original indent:
512 while (true) {
513 final char ch=sourceText.charAt(index);
514 if (!(ch==' ' || ch=='\t')) break;
515 if (++index==end) return false;
516 }
517 appendEssentialNewLine();
518 // Write the rest of the line including any indent greater than the first line's indent:
519 appendLineKeepWhiteSpace(end,0);
520 }
521 assert index==end;
522 return false;
523 }
524
525 private int getStartOfLinePos(final int end, final boolean atStartOfLine) {
526 // returns the starting position of the next complete line containing text, or -1 if texts starts on the current line (hence not a complete line).
527 // sets index to the start of the text following the returned position, or end, whichever comes first.
528 int startOfLinePos=atStartOfLine ? index : -1;
529 while (true) {
530 final char ch=sourceText.charAt(index);
531 if (ch=='\n' || ch=='\r') {
532 startOfLinePos=index+1;
533 } else if (!(ch==' ' || ch=='\t')) break;
534 if (++index==end) break;
535 }
536 return startOfLinePos;
537 }
538
539 private void appendSpecifiedTextInline(final CharSequence text, int depth) throws IOException {
540 final int textLength=text.length();
541 int i=appendSpecifiedLine(text,0);
542 if (i<textLength) {
543 final int subsequentLineDepth=depth+1;
544 do {
545 while (Segment.isWhiteSpace(text.charAt(i))) if (++i>=textLength) return; // trim whitespace.
546 appendEssentialNewLine();
547 appendIndent(subsequentLineDepth);
548 i=appendSpecifiedLine(text,i);
549 } while (i<textLength);
550 }
551 }
552
553 private int appendSpecifiedLine(final CharSequence text, int i) throws IOException {
554 // Writes the first line from the specified text starting from the specified position.
555 // The line break characters are not written.
556 // Returns the position following the first line break character(s), or text.length() if the text contains no line breaks.
557 final int textLength=text.length();
558 while (true) {
559 final char ch=text.charAt(i);
560 if (ch=='\r') {
561 final int nexti=i+1;
562 if (nexti<textLength && text.charAt(nexti)=='\n') return i+2;
563 }
564 if (ch=='\n') return i+1;
565 appendable.append(ch);
566 if (++i>=textLength) return i;
567 }
568 }
569
570 private boolean appendTextInline(final int end, int depth, final boolean increaseIndentAfterFirstLineBreak) throws IOException {
571 // returns true if all text was on one line, otherwise false
572 assert index<end;
573 appendLineKeepWhiteSpace(end,depth);
574 if (index==end) return true;
575 final int subsequentLineDepth=increaseIndentAfterFirstLineBreak ? depth+1 : depth;
576 do {
577 while (Segment.isWhiteSpace(sourceText.charAt(index))) if (++index==end) return false; // trim whitespace.
578 appendEssentialNewLine(); // essential because we might be inside a tag attribute value. If new lines in normal text aren't required this method wouldn't have been called.
579 appendIndent(subsequentLineDepth);
580 appendLineKeepWhiteSpace(end,subsequentLineDepth);
581 } while (index<end);
582 assert index==end;
583 return false;
584 }
585
586 private void appendLineKeepWhiteSpace(final int end, final int depth) throws IOException {
587 // Writes the first line from the source text starting from index, ending at the specified end position.
588 // The line break characters are not written.
589 // Sets index to the position following the first line break character(s), or end if the text contains no line breaks, guaranteed index<=end.
590 // Any tags encountered are written using the appendTag method, whose output may include line breaks.
591 assert index<end;
592 updateNextTag();
593 while (true) {
594 while (nextTag!=null && index==nextTag.begin) {
595 appendTag(nextTag,depth,end);
596 if (index==end) return;
597 }
598 final char ch=sourceText.charAt(index);
599 if (ch=='\r') {
600 final int nextindex=index+1;
601 if (nextindex<end && sourceText.charAt(nextindex)=='\n') {
602 index+=2;
603 assert index<=end;
604 return;
605 }
606 }
607 if (ch=='\n') {
608 index++;
609 assert index<=end;
610 return;
611 }
612 appendable.append(ch);
613 if (++index==end) return;
614 }
615 }
616
617 private void appendTextCollapseWhiteSpace(final int end, final int depth) throws IOException {
618 assert index<end;
619 boolean lastWasWhiteSpace=false;
620 updateNextTag();
621 while (index<end) {
622 while (nextTag!=null && index==nextTag.begin) {
623 if (lastWasWhiteSpace) {
624 appendable.append(' ');
625 lastWasWhiteSpace=false;
626 }
627 appendTag(nextTag,depth,end);
628 if (index==end) return;
629 }
630 final char ch=sourceText.charAt(index++);
631 if (Segment.isWhiteSpace(ch)) {
632 lastWasWhiteSpace=true;
633 } else {
634 if (lastWasWhiteSpace) {
635 appendable.append(' ');
636 lastWasWhiteSpace=false;
637 }
638 appendable.append(ch);
639 }
640 }
641 if (lastWasWhiteSpace) appendable.append(' ');
642 assert index==end;
643 }
644
645 private void appendContentPreformatted(final int end, final int depth) throws IOException {
646 assert index<end;
647 updateNextTag();
648 do {
649 while (nextTag!=null && index==nextTag.begin) {
650 appendTag(nextTag,depth,end);
651 if (index==end) return;
652 }
653 appendable.append(sourceText.charAt(index));
654 } while (++index<end);
655 assert index==end;
656 }
657
658 private void appendTag(final Tag tag, final int depth, final int end) throws IOException {
659 // sets index to last position written
660 assert index==tag.begin;
661 assert index<end;
662 nextTag=tag.getNextTag();
663 final int tagEnd=(tag.end<end) ? tag.end : end;
664 assert index<tagEnd;
665 if (tag.getTagType()==StartTagType.COMMENT || tag.getTagType()==StartTagType.CDATA_SECTION || tag.getTagType().isServerTag()) {
666 appendTextPreserveIndentation(tagEnd,depth);
667 } else if (tidyTags) {
668 final String tidyTag=tag.tidy();
669 if ((tag instanceof StartTag) && ((StartTag)tag).getAttributes()!=null)
670 appendable.append(tidyTag);
671 else
672 appendSpecifiedTextInline(tidyTag,depth);
673 index=tagEnd;
674 } else {
675 appendTextInline(tagEnd,depth,true); // Write tag keeping linefeeds. This will add an indent to any attribute values containing linefeeds, but the normal situation where line breaks are between attributes will look nice.
676 }
677 if (end<=tag.end || !(tag instanceof StartTag)) {
678 assert index<=end;
679 return;
680 }
681 if ((tag.name==HTMLElementName.SCRIPT && !indentScriptElements) || tag.getTagType().isServerTag()) {
682 // NOTE SERVER ELEMENTS CONTAINING NON-INLINE TAGS WILL NOT FORMAT PROPERLY. NEED TO INVESTIGATE INCLUDING SUCH SERVER ELEMENTS IN DOCUMENT HIERARCHY.
683 // this is a script or server start tag, we may need to append the whole element:
684 final Element element=tag.getElement();
685 final EndTag endTag=element.getEndTag();
686 if (endTag==null) {
687 assert index<=end;
688 return;
689 }
690 final int contentEnd=(end<endTag.begin) ? end : endTag.begin;
691 boolean singleLineContent=true;
692 if (index!=contentEnd) {
693 // elementContainsMarkup should be made into a TagType property one day.
694 // for the time being assume all server element content is code, although this is not true for some Mason elements.
695 final boolean elementContainsMarkup=false;
696 if (elementContainsMarkup) {
697 singleLineContent=appendTextInline(contentEnd,depth+1,false);
698 } else {
699 singleLineContent=appendTextPreserveIndentation(contentEnd,depth);
700 }
701 }
702 if (endTag.begin>=end) {
703 assert index<=end;
704 return;
705 }
706 if (!singleLineContent) {
707 appendEssentialNewLine(); // some server or client side scripting languages might need the final new line
708 appendIndent(depth);
709 }
710 assert index==endTag.begin;
711 appendTag(endTag,depth,end);
712 }
713 assert index<=end;
714 }
715
716 private void appendIndent(final int depth) throws IOException {
717 if (!removeLineBreaks) for (int x=0; x<depth; x++) appendable.append(indentString);
718 }
719
720 private void appendFormattingNewLine() throws IOException {
721 if (!removeLineBreaks) appendable.append(newLine);
722 }
723
724 private void appendEssentialNewLine() throws IOException {
725 appendable.append(newLine);
726 }
727
728 private boolean containsOnlyInlineLevelChildElements(final Element element) {
729 // returns true if the element contains only inline-level elements except for SCRIPT elements.
730 final Collection<Element> childElements=element.getChildElements();
731 if (childElements.isEmpty()) return true;
732 for (Element childElement : childElements) {
733 final String elementName=childElement.getName();
734 if (elementName==HTMLElementName.SCRIPT || !HTMLElements.getInlineLevelElementNames().contains(elementName)) return false;
735 if (!containsOnlyInlineLevelChildElements(childElement)) return false;
736 }
737 return true;
738 }
739 }
740 }

   
Visit the aagtl Website