/[aagtl_public1]/src/net/htmlparser/jericho/Util.java
aagtl

Contents of /src/net/htmlparser/jericho/Util.java

Parent Directory | Revision Log


Revision 2 - (show annotations) (download)
Sun Aug 5 13:48:36 2012 UTC (11 years, 7 months ago) by zoffadmin
File size: 5778 byte(s)
initial import of aagtl source code
1 // Jericho HTML Parser - Java based library for analysing and manipulating HTML
2 // Version 3.2
3 // Copyright (C) 2004-2009 Martin Jericho
4 // http://jericho.htmlparser.net/
5 //
6 // This library is free software; you can redistribute it and/or
7 // modify it under the terms of either one of the following licences:
8 //
9 // 1. The Eclipse Public License (EPL) version 1.0,
10 // included in this distribution in the file licence-epl-1.0.html
11 // or available at http://www.eclipse.org/legal/epl-v10.html
12 //
13 // 2. The GNU Lesser General Public License (LGPL) version 2.1 or later,
14 // included in this distribution in the file licence-lgpl-2.1.txt
15 // or available at http://www.gnu.org/licenses/lgpl.txt
16 //
17 // This library is distributed on an "AS IS" basis,
18 // WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 // See the individual licence texts for more details.
20
21 package net.htmlparser.jericho;
22
23 import java.util.*;
24 import java.io.*;
25
26 /**
27 * Contains miscellaneous utility methods not directly associated with the HTML Parser library.
28 */
29 public final class Util {
30 private static final int BUFFER_SIZE=2048;
31 private static final String CSVNewLine=System.getProperty("line.separator");
32
33 private Util() {}
34
35 /**
36 * Returns the text loaded from the specified <code>Reader</code> as a string.
37 * <p>
38 * If a <code>null</code> argument is supplied to this method, an empty string is returned.
39 * <p>
40 * To load text from an <code>InputStream</code>, use <code>getString(new InputStreamReader(inputStream,encoding))</code>.
41 *
42 * @param reader the <code>java.io.Reader</code> from which to load the text.
43 * @return the text loaded from the specified <code>java.io.Reader</code> as a string.
44 * @throws java.io.IOException if an I/O error occurs.
45 */
46 public static String getString(final Reader reader) throws IOException {
47 if (reader==null) return "";
48 try {
49 int charsRead;
50 final char[] copyBuffer=new char[BUFFER_SIZE];
51 final StringBuilder sb=new StringBuilder();
52 while ((charsRead=reader.read(copyBuffer,0,BUFFER_SIZE))!=-1)
53 sb.append(copyBuffer,0,charsRead);
54 return sb.toString();
55 } finally {
56 reader.close();
57 }
58 }
59
60 /**
61 * Outputs the specified array of strings to the specified <code>Writer</code> in the format of a line for a CSV file.
62 * <p>
63 * "CSV" stands for <i>Comma Separated Values</i>.
64 * There is no formal specification for a CSV file, so there is significant variation in
65 * the way different applications handle issues like the encoding of different data types and special characters.
66 * <p>
67 * Generally, a CSV file contains a list of records separated by line breaks, with each record consisting of a list of
68 * field values separated by commas.
69 * Each record in the file should contain the same number of field values, with the values at each position representing the same
70 * type of data in all the records. In this way the file can also be divided into columns, often with the first line of the
71 * file containing the column labels.
72 * <p>
73 * Columns can have different data types such as text, numeric, date / time and boolean.
74 * A text value is often delimited with single (<code>'</code>) or double-quotes (<code>"</code>),
75 * especially if the value contains a comma, line feed, or other special character that is significant to the syntax.
76 * Encoding techniques for including quote characters themselves in text values vary widely.
77 * Values of other types are generally unquoted to distinguish them from text values.
78 * <p>
79 * This method produces output that is readable by MS-Excel, conforming to the following rules:
80 * <p>
81 * <ul>
82 * <li>All values are considered to be of type text, except for the static constants {@link Config#ColumnValueTrue}
83 * and {@link Config#ColumnValueFalse}, representing the boolean values <code>true</code> and <code>false</code> respectively.
84 * <li>All text values are enclosed in double-quotes.
85 * <li>Double-quote characters contained in text values are encoded using two consecutive double-quotes (<code>""</code>).
86 * <li><code>null</code> values are represented as empty fields.
87 * <li>The end of each record is represented by a carriage-return / line-feed (CR/LF) pair.
88 * <li>Line breaks inside text values are represented by a single line feed (LF) character.
89 * </ul>
90 *
91 * @param writer the destination <code>java.io.Writer</code> for the output.
92 * @throws java.io.IOException if an I/O error occurs.
93 * @see FormFields#getColumnLabels()
94 * @see FormFields#getColumnValues(Map)
95 */
96 public static void outputCSVLine(final Writer writer, final String[] values) throws IOException {
97 for (int i=0; i<values.length;) {
98 final String value=values[i];
99 if (value!=null) {
100 if (value==Config.ColumnValueTrue || value==Config.ColumnValueFalse) {
101 writer.write(value); // assumes neither ColumnTrue or ColumnFalse contain double quotes.
102 } else {
103 writer.write('"');
104 outputValueEscapeQuotes(writer,value);
105 writer.write('"');
106 }
107 }
108 if (++i!=values.length) writer.write(',');
109 }
110 writer.write(CSVNewLine);
111 }
112
113 private static void outputValueEscapeQuotes(final Writer writer, final String text) throws IOException {
114 for (int i=0; i<text.length(); i++) {
115 final char ch=text.charAt(i);
116 writer.write(ch);
117 if (ch=='"') writer.write(ch);
118 }
119 }
120
121 static char[] getConcatenatedCharArray(final String string1, final String string2) {
122 final char[] charArray=new char[string1.length()+string2.length()];
123 string1.getChars(0,string1.length(),charArray,0);
124 string2.getChars(0,string2.length(),charArray,string1.length());
125 return charArray;
126 }
127 }

   
Visit the aagtl Website