1 |
// Jericho HTML Parser - Java based library for analysing and manipulating HTML
|
2 |
// Version 3.2
|
3 |
// Copyright (C) 2004-2009 Martin Jericho
|
4 |
// http://jericho.htmlparser.net/
|
5 |
//
|
6 |
// This library is free software; you can redistribute it and/or
|
7 |
// modify it under the terms of either one of the following licences:
|
8 |
//
|
9 |
// 1. The Eclipse Public License (EPL) version 1.0,
|
10 |
// included in this distribution in the file licence-epl-1.0.html
|
11 |
// or available at http://www.eclipse.org/legal/epl-v10.html
|
12 |
//
|
13 |
// 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
|
14 |
// included in this distribution in the file licence-lgpl-2.1.txt
|
15 |
// or available at http://www.gnu.org/licenses/lgpl.txt
|
16 |
//
|
17 |
// This library is distributed on an "AS IS" basis,
|
18 |
// WITHOUT WARRANTY OF ANY KIND, either express or implied.
|
19 |
// See the individual licence texts for more details.
|
20 |
|
21 |
package net.htmlparser.jericho;
|
22 |
|
23 |
import java.util.*;
|
24 |
import java.io.*;
|
25 |
import java.net.*;
|
26 |
|
27 |
/**
|
28 |
* Formats HTML source by laying out each non-inline-level element on a new line with an appropriate indent.
|
29 |
* <p>
|
30 |
* Any indentation present in the original source text is removed.
|
31 |
* <p>
|
32 |
* Use one of the following methods to obtain the output:
|
33 |
* <ul>
|
34 |
* <li>{@link #writeTo(Writer)}</li>
|
35 |
* <li>{@link #appendTo(Appendable)}</li>
|
36 |
* <li>{@link #toString()}</li>
|
37 |
* <li>{@link CharStreamSourceUtil#getReader(CharStreamSource) CharStreamSourceUtil.getReader(this)}</li>
|
38 |
* </ul>
|
39 |
* <p>
|
40 |
* The output text is functionally equivalent to the original source and should be rendered identically unless specified below.
|
41 |
* <p>
|
42 |
* The following points describe the process in general terms.
|
43 |
* Any aspect of the algorithm not specifically mentioned here is subject to change without notice in future versions.
|
44 |
* <p>
|
45 |
* <ul>
|
46 |
* <li>Every element that is not an {@linkplain HTMLElements#getInlineLevelElementNames() inline-level element} appears on a new line
|
47 |
* with an indent corresponding to its {@linkplain Element#getDepth() depth} in the <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>.
|
48 |
* <li>The indent is formed by writing <i>n</i> repetitions of the string specified in the {@link #setIndentString(String) IndentString} property,
|
49 |
* where <i>n</i> is the depth of the indentation.
|
50 |
* <li>The {@linkplain Element#getContent() content} of an indented element starts on a new line and is indented at a depth one greater than that of the element,
|
51 |
* with the end tag appearing on a new line at the same depth as the start tag.
|
52 |
* If the content contains only text and {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements},
|
53 |
* it may continue on the same line as the start tag. Additionally, if the output content contains no new lines, the end tag may also continue on the same line.
|
54 |
* <li>The content of preformatted elements such as {@link HTMLElementName#PRE PRE} and {@link HTMLElementName#TEXTAREA TEXTAREA} is not indented,
|
55 |
* nor is the white space modified in any way.
|
56 |
* <li>Only {@linkplain StartTagType#NORMAL normal} and {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} elements are indented.
|
57 |
* All others are treated as {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
|
58 |
* <li>White space and indentation inside HTML {@linkplain StartTagType#COMMENT comments}, {@linkplain StartTagType#CDATA_SECTION CDATA sections}, or any
|
59 |
* {@linkplain TagType#isServerTag() server tag} is preserved,
|
60 |
* but with the indentation of new lines starting at a depth one greater than that of the surrounding text.
|
61 |
* <li>White space and indentation inside {@link HTMLElementName#SCRIPT SCRIPT} elements is preserved,
|
62 |
* but with the indentation of new lines starting at a depth one greater than that of the <code>SCRIPT</code> element.
|
63 |
* <li>If the {@link #setTidyTags(boolean) TidyTags} property is set to <code>true</code>,
|
64 |
* every tag in the document is replaced with the output from its {@link Tag#tidy()} method.
|
65 |
* If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
|
66 |
* but with any new lines indented at a depth one greater than that of the element.
|
67 |
* <li>If the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} property
|
68 |
* is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
|
69 |
* located outside of a tag is replaced with a single space in the output.
|
70 |
* White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
|
71 |
* <li>If the {@link #setIndentAllElements(boolean) IndentAllElements} property
|
72 |
* is set to <code>true</code>, every element appears indented on a new line, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
|
73 |
* This generates output that is a good representation of the actual <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>,
|
74 |
* but is very likely to introduce white space that compromises the functional equivalency of the document.
|
75 |
* <li>The {@link #setNewLine(String) NewLine} property specifies the character sequence
|
76 |
* to use for each <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output document.
|
77 |
* <li>If the source document contains {@linkplain TagType#isServerTag() server tags}, the functional equivalency of the output document may be compromised.
|
78 |
* </ul>
|
79 |
* <p>
|
80 |
* Formatting an entire {@link Source} object performs a {@linkplain Source#fullSequentialParse() full sequential parse} automatically.
|
81 |
*/
|
82 |
public final class SourceFormatter implements CharStreamSource {
|
83 |
// The segment of the source document whose HTML is formatted.
private final Segment segment;

// Output options. Boolean options rely on the language default of false.
private String indentString="\t";
private boolean tidyTags;
private boolean collapseWhiteSpace;
private boolean removeLineBreaks;
private boolean indentAllElements;

// Newline string for the output; null until set explicitly or resolved lazily by getNewLine().
private String newLine;

/**
 * Creates a formatter for the HTML contained in the given {@link Segment}.
 * <p>
 * The segment is only stored here; output is produced when one of the output methods is called.
 *
 * @param segment  the segment containing the HTML to be formatted.
 * @see Source#getSourceFormatter()
 */
public SourceFormatter(final Segment segment) {
	this.segment=segment;
}
|
99 |
|
100 |
// Documentation inherited from CharStreamSource
|
101 |
public void writeTo(final Writer writer) throws IOException {
|
102 |
appendTo(writer);
|
103 |
writer.flush();
|
104 |
}
|
105 |
|
106 |
// Documentation inherited from CharStreamSource
|
107 |
public void appendTo(final Appendable appendable) throws IOException {
|
108 |
new Processor(segment,getIndentString(),getTidyTags(),getCollapseWhiteSpace(),getRemoveLineBreaks(),getIndentAllElements(),getIndentAllElements(),getNewLine()).appendTo(appendable);
|
109 |
}
|
110 |
|
111 |
// Documentation inherited from CharStreamSource
|
112 |
public long getEstimatedMaximumOutputLength() {
|
113 |
return segment.length()*2;
|
114 |
}
|
115 |
|
116 |
// Documentation inherited from CharStreamSource
|
117 |
public String toString() {
|
118 |
return CharStreamSourceUtil.toString(this);
|
119 |
}
|
120 |
|
121 |
/**
|
122 |
* Sets the string to be used for indentation.
|
123 |
* <p>
|
124 |
* The default value is a string containing a single tab character (U+0009).
|
125 |
* <p>
|
126 |
* The most commonly used indent strings are <code>"\t"</code> (single tab), <code>" "</code> (single space), <code>" "</code> (2 spaces), and <code>" "</code> (4 spaces).
|
127 |
*
|
128 |
* @param indentString the string to be used for indentation, must not be <code>null</code>.
|
129 |
* @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
130 |
* @see #getIndentString()
|
131 |
*/
|
132 |
public SourceFormatter setIndentString(final String indentString) {
|
133 |
if (indentString==null) throw new IllegalArgumentException("indentString property must not be null");
|
134 |
this.indentString=indentString;
|
135 |
return this;
|
136 |
}
|
137 |
|
138 |
/**
|
139 |
* Returns the string to be used for indentation.
|
140 |
* <p>
|
141 |
* See the {@link #setIndentString(String)} method for a full description of this property.
|
142 |
*
|
143 |
* @return the string to be used for indentation.
|
144 |
*/
|
145 |
public String getIndentString() {
|
146 |
return indentString;
|
147 |
}
|
148 |
|
149 |
/**
|
150 |
* Sets whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
|
151 |
* <p>
|
152 |
* The default value is <code>false</code>.
|
153 |
* <p>
|
154 |
* If this property is set to <code>false</code>, the tag from the original text is used, including all white space,
|
155 |
* but with any new lines indented at a depth one greater than that of the element.
|
156 |
*
|
157 |
* @param tidyTags specifies whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
|
158 |
* @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
159 |
* @see #getTidyTags()
|
160 |
*/
|
161 |
public SourceFormatter setTidyTags(final boolean tidyTags) {
|
162 |
this.tidyTags=tidyTags;
|
163 |
return this;
|
164 |
}
|
165 |
|
166 |
/**
|
167 |
* Indicates whether the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method.
|
168 |
* <p>
|
169 |
* See the {@link #setTidyTags(boolean)} method for a full description of this property.
|
170 |
*
|
171 |
* @return <code>true</code> if the original text of each tag is to be replaced with the output from its {@link Tag#tidy()} method, otherwise <code>false</code>.
|
172 |
*/
|
173 |
public boolean getTidyTags() {
|
174 |
return tidyTags;
|
175 |
}
|
176 |
|
177 |
/**
|
178 |
* Sets whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
|
179 |
* <p>
|
180 |
* The default value is <code>false</code>.
|
181 |
* <p>
|
182 |
* If this property is set to <code>true</code>, every string of one or more {@linkplain Segment#isWhiteSpace(char) white space} characters
|
183 |
* located outside of a tag is replaced with a single space in the output.
|
184 |
* White space located adjacent to a non-inline-level element tag (except {@linkplain TagType#isServerTag() server tags}) may be removed.
|
185 |
*
|
186 |
* @param collapseWhiteSpace specifies whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
|
187 |
* @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
188 |
* @see #getCollapseWhiteSpace()
|
189 |
*/
|
190 |
public SourceFormatter setCollapseWhiteSpace(final boolean collapseWhiteSpace) {
|
191 |
this.collapseWhiteSpace=collapseWhiteSpace;
|
192 |
return this;
|
193 |
}
|
194 |
|
195 |
/**
|
196 |
* Indicates whether {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed.
|
197 |
* <p>
|
198 |
* See the {@link #setCollapseWhiteSpace(boolean collapseWhiteSpace)} method for a full description of this property.
|
199 |
*
|
200 |
* @return <code>true</code> if {@linkplain Segment#isWhiteSpace(char) white space} in the text between the tags is to be collapsed, otherwise <code>false</code>.
|
201 |
*/
|
202 |
public boolean getCollapseWhiteSpace() {
|
203 |
return collapseWhiteSpace;
|
204 |
}
|
205 |
|
206 |
/**
|
207 |
* Sets whether all non-essential line breaks are removed.
|
208 |
* <p>
|
209 |
* The default value is <code>false</code>.
|
210 |
* <p>
|
211 |
* If this property is set to <code>true</code>, only essential line breaks are retained in the output.
|
212 |
* <p>
|
213 |
* Setting this property automatically engages the {@link #setCollapseWhiteSpace(boolean) CollapseWhiteSpace} option, regardless of its property setting.
|
214 |
* <p>
|
215 |
* It is recommended to set the {@link #setTidyTags(boolean) TidyTags} property when this option is used so that non-essential line breaks are also removed from tags.
|
216 |
*
|
217 |
* @param removeLineBreaks specifies whether all non-essential line breaks are removed.
|
218 |
* @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
219 |
* @see #getRemoveLineBreaks()
|
220 |
*/
|
221 |
SourceFormatter setRemoveLineBreaks(final boolean removeLineBreaks) {
|
222 |
this.removeLineBreaks=removeLineBreaks;
|
223 |
return this;
|
224 |
}
|
225 |
|
226 |
/**
|
227 |
* Indicates whether all non-essential line breaks are removed.
|
228 |
* <p>
|
229 |
* See the {@link #setRemoveLineBreaks(boolean removeLineBreaks)} method for a full description of this property.
|
230 |
*
|
231 |
* @return <code>true</code> if all non-essential line breaks are removed, otherwise <code>false</code>.
|
232 |
*/
|
233 |
boolean getRemoveLineBreaks() {
|
234 |
return removeLineBreaks;
|
235 |
}
|
236 |
|
237 |
/**
|
238 |
* Sets whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
|
239 |
* <p>
|
240 |
* The default value is <code>false</code>.
|
241 |
* <p>
|
242 |
* If this property is set to <code>true</code>, every element appears indented on a new line, including
|
243 |
* {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements}.
|
244 |
* <p>
|
245 |
* This generates output that is a good representation of the actual <a href="Source.html#DocumentElementHierarchy">document element hierarchy</a>,
|
246 |
* but is very likely to introduce white space that compromises the functional equivalency of the document.
|
247 |
*
|
248 |
* @param indentAllElements specifies whether all elements are to be indented.
|
249 |
* @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
250 |
* @see #getIndentAllElements()
|
251 |
*/
|
252 |
public SourceFormatter setIndentAllElements(final boolean indentAllElements) {
|
253 |
this.indentAllElements=indentAllElements;
|
254 |
return this;
|
255 |
}
|
256 |
|
257 |
/**
|
258 |
* Indicates whether all elements are to be indented, including {@linkplain HTMLElements#getInlineLevelElementNames() inline-level elements} and those with preformatted contents.
|
259 |
* <p>
|
260 |
* See the {@link #setIndentAllElements(boolean)} method for a full description of this property.
|
261 |
*
|
262 |
* @return <code>true</code> if all elements are to be indented, otherwise <code>false</code>.
|
263 |
*/
|
264 |
public boolean getIndentAllElements() {
|
265 |
return indentAllElements;
|
266 |
}
|
267 |
|
268 |
/**
|
269 |
* Sets the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
|
270 |
* <p>
|
271 |
* The default is to use the same new line string as is used in the source document, which is determined via the {@link Source#getNewLine()} method.
|
272 |
* If the source document does not contain any new lines, a "best guess" is made by either taking the new line string of a previously parsed document,
|
273 |
* or using the value from the static {@link Config#NewLine} property.
|
274 |
* <p>
|
275 |
* Specifying a <code>null</code> argument resets the property to its default value, which is to use the same new line string as is used in the source document.
|
276 |
*
|
277 |
* @param newLine the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output, may be <code>null</code>.
|
278 |
* @return this <code>SourceFormatter</code> instance, allowing multiple property setting methods to be chained in a single statement.
|
279 |
* @see #getNewLine()
|
280 |
*/
|
281 |
public SourceFormatter setNewLine(final String newLine) {
|
282 |
this.newLine=newLine;
|
283 |
return this;
|
284 |
}
|
285 |
|
286 |
/**
|
287 |
* Returns the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
|
288 |
* <p>
|
289 |
* See the {@link #setNewLine(String)} method for a full description of this property.
|
290 |
*
|
291 |
* @return the string to be used to represent a <a target="_blank" href="http://en.wikipedia.org/wiki/Newline">newline</a> in the output.
|
292 |
*/
|
293 |
public String getNewLine() {
|
294 |
if (newLine==null) newLine=segment.source.getBestGuessNewLine();
|
295 |
return newLine;
|
296 |
}
|
297 |
|
298 |
/** This class does the actual work, but is first passed final copies of all the parameters for efficiency. */
|
299 |
private static final class Processor {
|
300 |
// ---- Configuration, fixed for the lifetime of this Processor ----

// The segment being formatted.
private final Segment segment;
// String form of the segment's whole source; "index" and all end positions refer to character positions in this text.
private final CharSequence sourceText;
// Written once per indentation level.
private final String indentString;
// Whether tags are written via Tag.tidy() rather than from their original text.
private final boolean tidyTags;
// Whether runs of white space between tags are collapsed; always true when removeLineBreaks is true.
private final boolean collapseWhiteSpace;
// Whether all non-essential line breaks are removed.
private final boolean removeLineBreaks;
// Whether every element is indented on its own line, including inline-level elements.
private final boolean indentAllElements;
// At present this option is tied to indentAllElements. SCRIPT elements need to be inline to keep functional equivalency of output.
private final boolean indentScriptElements;
// Newline string written to the output.
private final String newLine;

// ---- Working state, set up by appendTo ----

// Destination of the output.
private Appendable appendable;
// Tag cursor used while writing text; brought up to date by updateNextTag().
private Tag nextTag;
// Current position in sourceText.
private int index;

/**
 * Captures the formatting options for a single formatting run.
 *
 * @param segment  the segment to format.
 * @param indentString  the string written once per indentation level.
 * @param tidyTags  whether tags are written using their tidied form.
 * @param collapseWhiteSpace  whether white space between tags is collapsed; treated as <code>true</code> whenever <code>removeLineBreaks</code> is <code>true</code>.
 * @param removeLineBreaks  whether all non-essential line breaks are removed.
 * @param indentAllElements  whether every element is indented on its own line.
 * @param indentScriptElements  whether SCRIPT elements are indented rather than inlined.
 * @param newLine  the newline string written to the output.
 */
public Processor(final Segment segment, final String indentString, final boolean tidyTags, final boolean collapseWhiteSpace, final boolean removeLineBreaks, final boolean indentAllElements, final boolean indentScriptElements, final String newLine) {
	this.segment=segment;
	this.sourceText=segment.source.toString();
	this.newLine=newLine;
	this.indentString=indentString;
	this.tidyTags=tidyTags;
	this.removeLineBreaks=removeLineBreaks;
	// Removing line breaks implies collapsing white space.
	this.collapseWhiteSpace=removeLineBreaks || collapseWhiteSpace;
	this.indentAllElements=indentAllElements;
	this.indentScriptElements=indentScriptElements;
}
|
325 |
|
326 |
public void appendTo(final Appendable appendable) throws IOException {
|
327 |
this.appendable=appendable;
|
328 |
if (segment instanceof Source) ((Source)segment).fullSequentialParse();
|
329 |
nextTag=segment.source.getNextTag(segment.begin);
|
330 |
index=segment.begin;
|
331 |
appendContent(segment.end,segment.getChildElements(),0);
|
332 |
}
|
333 |
|
334 |
private void appendContent(final int end, final List<Element> childElements, final int depth) throws IOException {
|
335 |
assert index<=end;
|
336 |
for (Element element : childElements) {
|
337 |
final int elementBegin=element.begin;
|
338 |
if (elementBegin>=end) break;
|
339 |
if (indentAllElements) {
|
340 |
appendText(elementBegin,depth);
|
341 |
appendElement(element,depth,end,false,false);
|
342 |
} else {
|
343 |
if (inlinable(element)) continue; // skip over elements that can be inlined.
|
344 |
appendText(elementBegin,depth);
|
345 |
final String elementName=element.getName();
|
346 |
if (elementName==HTMLElementName.PRE || elementName==HTMLElementName.TEXTAREA) {
|
347 |
appendElement(element,depth,end,true,true);
|
348 |
} else if (elementName==HTMLElementName.SCRIPT) {
|
349 |
appendElement(element,depth,end,true,false);
|
350 |
} else {
|
351 |
appendElement(element,depth,end,false,!removeLineBreaks && containsOnlyInlineLevelChildElements(element));
|
352 |
}
|
353 |
}
|
354 |
}
|
355 |
appendText(end,depth);
|
356 |
assert index==end;
|
357 |
}
|
358 |
|
359 |
private boolean inlinable(final Element element) {
|
360 |
// returns true if the specified element should be inlined
|
361 |
final StartTagType startTagType=element.getStartTag().getStartTagType();
|
362 |
// if (startTagType==StartTagType.DOCTYPE_DECLARATION) return false; // this was removed because it caused an extra line break if the DOCTYPE is preceeded by a server tag
|
363 |
if (startTagType!=StartTagType.NORMAL) return true;
|
364 |
// element is a normal type
|
365 |
final String elementName=element.getName();
|
366 |
if (elementName==HTMLElementName.SCRIPT) return !indentScriptElements;
|
367 |
if (removeLineBreaks && !HTMLElements.getElementNames().contains(elementName)) return true; // inline non-HTML elements if removing line breaks
|
368 |
if (!HTMLElements.getInlineLevelElementNames().contains(elementName)) return false;
|
369 |
// element is inline type
|
370 |
if (removeLineBreaks) return true;
|
371 |
if (elementName==HTMLElementName.TEXTAREA) return false; // TEXTAREA is theoretically inlinable but we want to format its content in the same was as PRE, and this is easiest when the entire element is treated like a block PRE element.
|
372 |
return containsOnlyInlineLevelChildElements(element); // only inline if it doesn't illegally contain non-inline elements
|
373 |
}
|
374 |
|
375 |
private void appendText(final int end, int depth) throws IOException {
|
376 |
assert index<=end;
|
377 |
if (index==end) return;
|
378 |
while (Segment.isWhiteSpace(sourceText.charAt(index))) if (++index==end) return; // trim whitespace.
|
379 |
appendIndent(depth);
|
380 |
if (collapseWhiteSpace) {
|
381 |
appendTextCollapseWhiteSpace(end,depth);
|
382 |
} else {
|
383 |
appendTextInline(end,depth,false);
|
384 |
}
|
385 |
appendFormattingNewLine();
|
386 |
assert index==end;
|
387 |
}
|
388 |
|
389 |
private void appendElement(final Element element, final int depth, final int end, final boolean preformatted, boolean renderContentInline) throws IOException {
|
390 |
assert index==element.begin;
|
391 |
assert index<end;
|
392 |
final StartTag startTag=element.getStartTag();
|
393 |
final EndTag endTag=element.getEndTag();
|
394 |
appendIndent(depth);
|
395 |
appendTag(startTag,depth,end);
|
396 |
if (index==end) {
|
397 |
appendFormattingNewLine();
|
398 |
assert index==Math.min(element.end,end) : index;
|
399 |
return;
|
400 |
}
|
401 |
if (!renderContentInline) appendFormattingNewLine();
|
402 |
int contentEnd=element.getContentEnd();
|
403 |
if (end<contentEnd) contentEnd=end;
|
404 |
if (index<contentEnd) {
|
405 |
if (preformatted) {
|
406 |
if (renderContentInline) {
|
407 |
// Preformatted element such as PRE, TEXTAREA
|
408 |
appendContentPreformatted(contentEnd,depth);
|
409 |
} else {
|
410 |
// SCRIPT element
|
411 |
appendIndentedScriptContent(contentEnd,depth+1);
|
412 |
}
|
413 |
} else {
|
414 |
if (renderContentInline) {
|
415 |
// Element contains only inline-level elements, so don't bother putting start and end tags on separate lines
|
416 |
if (collapseWhiteSpace) {
|
417 |
appendTextCollapseWhiteSpace(contentEnd,depth);
|
418 |
} else {
|
419 |
if (!appendTextInline(contentEnd,depth,true)) {
|
420 |
appendFormattingNewLine();
|
421 |
renderContentInline=false;
|
422 |
}
|
423 |
}
|
424 |
} else {
|
425 |
appendContent(contentEnd,element.getChildElements(),depth+1);
|
426 |
}
|
427 |
}
|
428 |
}
|
429 |
if (endTag!=null && end>endTag.begin) {
|
430 |
if (!renderContentInline) appendIndent(depth);
|
431 |
assert index==endTag.begin;
|
432 |
appendTag(endTag,depth,end);
|
433 |
appendFormattingNewLine();
|
434 |
} else if (renderContentInline) {
|
435 |
appendFormattingNewLine();
|
436 |
}
|
437 |
assert index==Math.min(element.end,end) : index;
|
438 |
}
|
439 |
|
440 |
private void updateNextTag() {
|
441 |
// ensures that nextTag is up to date
|
442 |
while (nextTag!=null) {
|
443 |
if (nextTag.begin>=index) return;
|
444 |
nextTag=nextTag.getNextTag();
|
445 |
}
|
446 |
}
|
447 |
|
448 |
private void appendIndentedScriptContent(final int end, final int depth) throws IOException {
|
449 |
assert index<end;
|
450 |
if (removeLineBreaks) {
|
451 |
appendTextRemoveIndentation(end);
|
452 |
assert index==end;
|
453 |
return;
|
454 |
}
|
455 |
int startOfLinePos=getStartOfLinePos(end,false);
|
456 |
if (index==end) return;
|
457 |
if (startOfLinePos==-1) {
|
458 |
// Script started on same line as start tag. Use the start of the next line to determine the original indent.
|
459 |
appendIndent(depth);
|
460 |
appendLineKeepWhiteSpace(end,depth);
|
461 |
appendEssentialNewLine();
|
462 |
if (index==end) return;
|
463 |
startOfLinePos=getStartOfLinePos(end,true);
|
464 |
if (index==end) return;
|
465 |
}
|
466 |
appendTextPreserveIndentation(end,depth,index-startOfLinePos);
|
467 |
appendEssentialNewLine();
|
468 |
assert index==end;
|
469 |
}
|
470 |
|
471 |
private boolean appendTextPreserveIndentation(final int end, final int depth) throws IOException {
|
472 |
// returns true if all text was on one line, otherwise false
|
473 |
assert index<end;
|
474 |
if (removeLineBreaks) return appendTextRemoveIndentation(end);
|
475 |
// Use the start of the next line to determine the original indent.
|
476 |
appendLineKeepWhiteSpace(end,depth);
|
477 |
if (index==end) return true;
|
478 |
int startOfLinePos=getStartOfLinePos(end,true);
|
479 |
if (index==end) return true;
|
480 |
appendEssentialNewLine();
|
481 |
appendTextPreserveIndentation(end,depth+1,index-startOfLinePos);
|
482 |
assert index==end;
|
483 |
return false;
|
484 |
}
|
485 |
|
486 |
private void appendTextPreserveIndentation(final int end, final int depth, final int originalIndentLength) throws IOException {
|
487 |
assert index<end;
|
488 |
appendIndent(depth);
|
489 |
appendLineKeepWhiteSpace(end,depth);
|
490 |
while (index!=end) {
|
491 |
// Skip over the original indent:
|
492 |
for (int x=0; x<originalIndentLength; x++) {
|
493 |
final char ch=sourceText.charAt(index);
|
494 |
if (!(ch==' ' || ch=='\t')) break;
|
495 |
if (++index==end) return;
|
496 |
}
|
497 |
appendEssentialNewLine();
|
498 |
// Insert our indent:
|
499 |
appendIndent(depth);
|
500 |
// Write the rest of the line including any indent greater than the first line's indent:
|
501 |
appendLineKeepWhiteSpace(end,depth);
|
502 |
}
|
503 |
assert index==end;
|
504 |
}
|
505 |
|
506 |
private boolean appendTextRemoveIndentation(final int end) throws IOException {
|
507 |
assert index<end;
|
508 |
appendLineKeepWhiteSpace(end,0);
|
509 |
if (index==end) return true;
|
510 |
while (index!=end) {
|
511 |
// Skip over the original indent:
|
512 |
while (true) {
|
513 |
final char ch=sourceText.charAt(index);
|
514 |
if (!(ch==' ' || ch=='\t')) break;
|
515 |
if (++index==end) return false;
|
516 |
}
|
517 |
appendEssentialNewLine();
|
518 |
// Write the rest of the line including any indent greater than the first line's indent:
|
519 |
appendLineKeepWhiteSpace(end,0);
|
520 |
}
|
521 |
assert index==end;
|
522 |
return false;
|
523 |
}
|
524 |
|
525 |
private int getStartOfLinePos(final int end, final boolean atStartOfLine) {
|
526 |
// returns the starting position of the next complete line containing text, or -1 if texts starts on the current line (hence not a complete line).
|
527 |
// sets index to the start of the text following the returned position, or end, whichever comes first.
|
528 |
int startOfLinePos=atStartOfLine ? index : -1;
|
529 |
while (true) {
|
530 |
final char ch=sourceText.charAt(index);
|
531 |
if (ch=='\n' || ch=='\r') {
|
532 |
startOfLinePos=index+1;
|
533 |
} else if (!(ch==' ' || ch=='\t')) break;
|
534 |
if (++index==end) break;
|
535 |
}
|
536 |
return startOfLinePos;
|
537 |
}
|
538 |
|
539 |
private void appendSpecifiedTextInline(final CharSequence text, int depth) throws IOException {
|
540 |
final int textLength=text.length();
|
541 |
int i=appendSpecifiedLine(text,0);
|
542 |
if (i<textLength) {
|
543 |
final int subsequentLineDepth=depth+1;
|
544 |
do {
|
545 |
while (Segment.isWhiteSpace(text.charAt(i))) if (++i>=textLength) return; // trim whitespace.
|
546 |
appendEssentialNewLine();
|
547 |
appendIndent(subsequentLineDepth);
|
548 |
i=appendSpecifiedLine(text,i);
|
549 |
} while (i<textLength);
|
550 |
}
|
551 |
}
|
552 |
|
553 |
private int appendSpecifiedLine(final CharSequence text, int i) throws IOException {
|
554 |
// Writes the first line from the specified text starting from the specified position.
|
555 |
// The line break characters are not written.
|
556 |
// Returns the position following the first line break character(s), or text.length() if the text contains no line breaks.
|
557 |
final int textLength=text.length();
|
558 |
while (true) {
|
559 |
final char ch=text.charAt(i);
|
560 |
if (ch=='\r') {
|
561 |
final int nexti=i+1;
|
562 |
if (nexti<textLength && text.charAt(nexti)=='\n') return i+2;
|
563 |
}
|
564 |
if (ch=='\n') return i+1;
|
565 |
appendable.append(ch);
|
566 |
if (++i>=textLength) return i;
|
567 |
}
|
568 |
}
|
569 |
|
570 |
private boolean appendTextInline(final int end, int depth, final boolean increaseIndentAfterFirstLineBreak) throws IOException {
|
571 |
// returns true if all text was on one line, otherwise false
|
572 |
assert index<end;
|
573 |
appendLineKeepWhiteSpace(end,depth);
|
574 |
if (index==end) return true;
|
575 |
final int subsequentLineDepth=increaseIndentAfterFirstLineBreak ? depth+1 : depth;
|
576 |
do {
|
577 |
while (Segment.isWhiteSpace(sourceText.charAt(index))) if (++index==end) return false; // trim whitespace.
|
578 |
appendEssentialNewLine(); // essential because we might be inside a tag attribute value. If new lines in normal text aren't required this method wouldn't have been called.
|
579 |
appendIndent(subsequentLineDepth);
|
580 |
appendLineKeepWhiteSpace(end,subsequentLineDepth);
|
581 |
} while (index<end);
|
582 |
assert index==end;
|
583 |
return false;
|
584 |
}
|
585 |
|
586 |
private void appendLineKeepWhiteSpace(final int end, final int depth) throws IOException {
|
587 |
// Writes the first line from the source text starting from index, ending at the specified end position.
|
588 |
// The line break characters are not written.
|
589 |
// Sets index to the position following the first line break character(s), or end if the text contains no line breaks, guaranteed index<=end.
|
590 |
// Any tags encountered are written using the appendTag method, whose output may include line breaks.
|
591 |
assert index<end;
|
592 |
updateNextTag();
|
593 |
while (true) {
|
594 |
while (nextTag!=null && index==nextTag.begin) {
|
595 |
appendTag(nextTag,depth,end);
|
596 |
if (index==end) return;
|
597 |
}
|
598 |
final char ch=sourceText.charAt(index);
|
599 |
if (ch=='\r') {
|
600 |
final int nextindex=index+1;
|
601 |
if (nextindex<end && sourceText.charAt(nextindex)=='\n') {
|
602 |
index+=2;
|
603 |
assert index<=end;
|
604 |
return;
|
605 |
}
|
606 |
}
|
607 |
if (ch=='\n') {
|
608 |
index++;
|
609 |
assert index<=end;
|
610 |
return;
|
611 |
}
|
612 |
appendable.append(ch);
|
613 |
if (++index==end) return;
|
614 |
}
|
615 |
}
|
616 |
|
617 |
private void appendTextCollapseWhiteSpace(final int end, final int depth) throws IOException {
|
618 |
assert index<end;
|
619 |
boolean lastWasWhiteSpace=false;
|
620 |
updateNextTag();
|
621 |
while (index<end) {
|
622 |
while (nextTag!=null && index==nextTag.begin) {
|
623 |
if (lastWasWhiteSpace) {
|
624 |
appendable.append(' ');
|
625 |
lastWasWhiteSpace=false;
|
626 |
}
|
627 |
appendTag(nextTag,depth,end);
|
628 |
if (index==end) return;
|
629 |
}
|
630 |
final char ch=sourceText.charAt(index++);
|
631 |
if (Segment.isWhiteSpace(ch)) {
|
632 |
lastWasWhiteSpace=true;
|
633 |
} else {
|
634 |
if (lastWasWhiteSpace) {
|
635 |
appendable.append(' ');
|
636 |
lastWasWhiteSpace=false;
|
637 |
}
|
638 |
appendable.append(ch);
|
639 |
}
|
640 |
}
|
641 |
if (lastWasWhiteSpace) appendable.append(' ');
|
642 |
assert index==end;
|
643 |
}
|
644 |
|
645 |
private void appendContentPreformatted(final int end, final int depth) throws IOException {
|
646 |
assert index<end;
|
647 |
updateNextTag();
|
648 |
do {
|
649 |
while (nextTag!=null && index==nextTag.begin) {
|
650 |
appendTag(nextTag,depth,end);
|
651 |
if (index==end) return;
|
652 |
}
|
653 |
appendable.append(sourceText.charAt(index));
|
654 |
} while (++index<end);
|
655 |
assert index==end;
|
656 |
}
|
657 |
|
658 |
private void appendTag(final Tag tag, final int depth, final int end) throws IOException {
|
659 |
// sets index to last position written
|
660 |
assert index==tag.begin;
|
661 |
assert index<end;
|
662 |
nextTag=tag.getNextTag();
|
663 |
final int tagEnd=(tag.end<end) ? tag.end : end;
|
664 |
assert index<tagEnd;
|
665 |
if (tag.getTagType()==StartTagType.COMMENT || tag.getTagType()==StartTagType.CDATA_SECTION || tag.getTagType().isServerTag()) {
|
666 |
appendTextPreserveIndentation(tagEnd,depth);
|
667 |
} else if (tidyTags) {
|
668 |
final String tidyTag=tag.tidy();
|
669 |
if ((tag instanceof StartTag) && ((StartTag)tag).getAttributes()!=null)
|
670 |
appendable.append(tidyTag);
|
671 |
else
|
672 |
appendSpecifiedTextInline(tidyTag,depth);
|
673 |
index=tagEnd;
|
674 |
} else {
|
675 |
appendTextInline(tagEnd,depth,true); // Write tag keeping linefeeds. This will add an indent to any attribute values containing linefeeds, but the normal situation where line breaks are between attributes will look nice.
|
676 |
}
|
677 |
if (end<=tag.end || !(tag instanceof StartTag)) {
|
678 |
assert index<=end;
|
679 |
return;
|
680 |
}
|
681 |
if ((tag.name==HTMLElementName.SCRIPT && !indentScriptElements) || tag.getTagType().isServerTag()) {
|
682 |
// NOTE SERVER ELEMENTS CONTAINING NON-INLINE TAGS WILL NOT FORMAT PROPERLY. NEED TO INVESTIGATE INCLUDING SUCH SERVER ELEMENTS IN DOCUMENT HIERARCHY.
|
683 |
// this is a script or server start tag, we may need to append the whole element:
|
684 |
final Element element=tag.getElement();
|
685 |
final EndTag endTag=element.getEndTag();
|
686 |
if (endTag==null) {
|
687 |
assert index<=end;
|
688 |
return;
|
689 |
}
|
690 |
final int contentEnd=(end<endTag.begin) ? end : endTag.begin;
|
691 |
boolean singleLineContent=true;
|
692 |
if (index!=contentEnd) {
|
693 |
// elementContainsMarkup should be made into a TagType property one day.
|
694 |
// for the time being assume all server element content is code, although this is not true for some Mason elements.
|
695 |
final boolean elementContainsMarkup=false;
|
696 |
if (elementContainsMarkup) {
|
697 |
singleLineContent=appendTextInline(contentEnd,depth+1,false);
|
698 |
} else {
|
699 |
singleLineContent=appendTextPreserveIndentation(contentEnd,depth);
|
700 |
}
|
701 |
}
|
702 |
if (endTag.begin>=end) {
|
703 |
assert index<=end;
|
704 |
return;
|
705 |
}
|
706 |
if (!singleLineContent) {
|
707 |
appendEssentialNewLine(); // some server or client side scripting languages might need the final new line
|
708 |
appendIndent(depth);
|
709 |
}
|
710 |
assert index==endTag.begin;
|
711 |
appendTag(endTag,depth,end);
|
712 |
}
|
713 |
assert index<=end;
|
714 |
}
|
715 |
|
716 |
private void appendIndent(final int depth) throws IOException {
|
717 |
if (!removeLineBreaks) for (int x=0; x<depth; x++) appendable.append(indentString);
|
718 |
}
|
719 |
|
720 |
private void appendFormattingNewLine() throws IOException {
|
721 |
if (!removeLineBreaks) appendable.append(newLine);
|
722 |
}
|
723 |
|
724 |
private void appendEssentialNewLine() throws IOException {
|
725 |
appendable.append(newLine);
|
726 |
}
|
727 |
|
728 |
private boolean containsOnlyInlineLevelChildElements(final Element element) {
|
729 |
// returns true if the element contains only inline-level elements except for SCRIPT elements.
|
730 |
final Collection<Element> childElements=element.getChildElements();
|
731 |
if (childElements.isEmpty()) return true;
|
732 |
for (Element childElement : childElements) {
|
733 |
final String elementName=childElement.getName();
|
734 |
if (elementName==HTMLElementName.SCRIPT || !HTMLElements.getInlineLevelElementNames().contains(elementName)) return false;
|
735 |
if (!containsOnlyInlineLevelChildElements(childElement)) return false;
|
736 |
}
|
737 |
return true;
|
738 |
}
|
739 |
}
|
740 |
}
|