/**
 * aagtl Advanced Geocaching Tool for Android
 * loosely based on agtl by Daniel Fett <fett@danielfett.de>
 * Copyright (C) 2010 - 2012 Zoff <aagtl@work.zoff.cc>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
|
20 |
|
21 |
/** Copyright (c) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
35 |
|
36 |
package com.zoffcc.applications.aagtl;
|
37 |
|
38 |
import java.util.regex.Pattern;
|
39 |
|
40 |
/**
|
41 |
* Convert provided html formatted string to text format.
|
42 |
*
|
43 |
*
|
44 |
*/
|
45 |
public final class HtmlToText
|
46 |
{
|
47 |
|
48 |
/**
|
49 |
* Regular expression to match html line breaks or paragraph tags
|
50 |
* and adjacent whitespace
|
51 |
*/
|
52 |
private static final Pattern htmlNewlinePattern = Pattern.compile("\\s*<(br|/?p)>\\s*");
|
53 |
|
54 |
/** Regular expression to match list tags and adjacent whitespace */
|
55 |
private static final Pattern htmlListPattern = Pattern.compile("\\s*<li>\\s*");
|
56 |
|
57 |
/** Regular expression to match any remaining html tags */
|
58 |
private static final Pattern htmlTagPattern = Pattern.compile("</?([^<]*)>");
|
59 |
|
60 |
/** Maximum length of a line in email body (in characters) */
|
61 |
public static final int EMAIL_LINE_WIDTH_MAX = 72;
|
62 |
|
63 |
// This class should not be instantiated, hence the private constructor
|
64 |
private HtmlToText()
|
65 |
{
|
66 |
}
|
67 |
|
68 |
/**
|
69 |
* Convert provided html string to plain text preserving the formatting
|
70 |
* as much as possible. Ensure line wrapping to 72 chars as default.
|
71 |
* NOTE: add support for more HTML tags here.
|
72 |
* For the present, convert <br>
|
73 |
* to '\n'
|
74 |
* convert
|
75 |
* <p>
|
76 |
* and
|
77 |
* </p>
|
78 |
* to '\n'
|
79 |
* convert <li>to "\n- "
|
80 |
*
|
81 |
* @throws NullPointerException
|
82 |
*/
|
83 |
public static String htmlToPlainText(String html)
|
84 |
{
|
85 |
|
86 |
if (html == null)
|
87 |
{
|
88 |
throw new NullPointerException("Html parameter may not be null.");
|
89 |
}
|
90 |
|
91 |
// Clear any html indentation and incidental whitespace
|
92 |
String text = StringUtils.stripAndCollapse(html);
|
93 |
|
94 |
/*
|
95 |
* Replace <br> and <p> tags with new line characters.
|
96 |
* Replace <li> tags (HTML bullets) with dashes.
|
97 |
* Remove any remaining HTML tags not supported yet.
|
98 |
* Finally replace any HTML escape string with appropriate character
|
99 |
*/
|
100 |
text = htmlNewlinePattern.matcher(text).replaceAll("\n");
|
101 |
text = htmlListPattern.matcher(text).replaceAll("\n- ");
|
102 |
text = htmlTagPattern.matcher(text).replaceAll("");
|
103 |
text = StringUtils.unescapeHTML(text, 0).trim();
|
104 |
|
105 |
/*
|
106 |
* Ensure no line of plain text is longer than default (72 chars)
|
107 |
* NOTE: Use String.split, NOT StringUtil.split, in order to preserve
|
108 |
* consecutive newline characters originating from <br> and <p> tags
|
109 |
*/
|
110 |
return StringUtils.fixedWidth(text.split("\n"), EMAIL_LINE_WIDTH_MAX);
|
111 |
}
|
112 |
}
|