001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.lang3;
018
019 import java.io.IOException;
020 import java.io.Writer;
021
022 import org.apache.commons.lang3.text.translate.AggregateTranslator;
023 import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
024 import org.apache.commons.lang3.text.translate.EntityArrays;
025 import org.apache.commons.lang3.text.translate.LookupTranslator;
026 import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
027 import org.apache.commons.lang3.text.translate.UnicodeEscaper;
028 import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
029
030 /**
 * <p>Escapes and unescapes <code>String</code>s for
 * Java, JavaScript (EcmaScript), HTML, XML and CSV.</p>
033 *
034 * <p>#ThreadSafe#</p>
035 * @author Apache Software Foundation
036 * @author Apache Jakarta Turbine
037 * @author Purple Technology
038 * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
039 * @author Antony Riley
040 * @author Helge Tesgaard
041 * @author <a href="sean@boohai.com">Sean Brown</a>
042 * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
043 * @author Phil Steitz
044 * @author Pete Gieser
045 * @since 2.0
046 * @version $Id: StringEscapeUtils.java 918868 2010-03-04 06:22:16Z bayard $
047 */
048 public class StringEscapeUtils {
049
050 /* ESCAPE TRANSLATORS */
051
052 public static final CharSequenceTranslator ESCAPE_JAVA =
053 new LookupTranslator(
054 new String[][] {
055 {"\"", "\\\""},
056 {"\\", "\\\\"},
057 }).with(
058 new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
059 ).with(
060 UnicodeEscaper.outsideOf(32, 0x7f)
061 );
062
063 public static final CharSequenceTranslator ESCAPE_ECMASCRIPT =
064 new AggregateTranslator(
065 new LookupTranslator(
066 new String[][] {
067 {"'", "\\'"},
068 {"\"", "\\\""},
069 {"\\", "\\\\"},
070 {"/", "\\/"}
071 }),
072 new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
073 UnicodeEscaper.outsideOf(32, 0x7f)
074 );
075
076 public static final CharSequenceTranslator ESCAPE_XML =
077 new AggregateTranslator(
078 new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
079 new LookupTranslator(EntityArrays.APOS_ESCAPE())
080 );
081
082 public static final CharSequenceTranslator ESCAPE_HTML3 =
083 new AggregateTranslator(
084 new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
085 new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
086 );
087
088 public static final CharSequenceTranslator ESCAPE_HTML4 =
089 new AggregateTranslator(
090 new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
091 new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
092 new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
093 );
094
095 public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();
096
097 // TODO: Create a parent class - 'SinglePassTranslator' ?
098 // TODO: It would handle the index checking, and length returning, and
099 // TODO: could also have an optimization check method.
100 static class CsvEscaper extends CharSequenceTranslator {
101
102 private static final char CSV_DELIMITER = ',';
103 private static final char CSV_QUOTE = '"';
104 private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
105 private static final char[] CSV_SEARCH_CHARS = new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
106
107 // TODO: Replace with a RegexTranslator. That should consume the number of characters the regex uses up?
108 @Override
109 public int translate(CharSequence input, int index, Writer out) throws IOException {
110
111 if(index != 0) {
112 throw new IllegalStateException("CsvEscaper should never reach the [1] index");
113 }
114
115 if (StringUtils.containsNone(input.toString(), CSV_SEARCH_CHARS)) {
116 out.write(input.toString());
117 } else {
118 out.write(CSV_QUOTE);
119 out.write(StringUtils.replace(input.toString(), CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
120 out.write(CSV_QUOTE);
121 }
122 return input.length();
123 }
124 }
125
126 /* UNESCAPE TRANSLATORS */
127
128 // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
129 public static final CharSequenceTranslator UNESCAPE_JAVA =
130 new AggregateTranslator(
131 new UnicodeUnescaper(),
132 new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
133 new LookupTranslator(
134 new String[][] {
135 {"\\\\", "\\"},
136 {"\\\"", "\""},
137 {"\\'", "'"},
138 {"\\", ""}
139 })
140 );
141
142 public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;
143
144 public static final CharSequenceTranslator UNESCAPE_HTML3 =
145 new AggregateTranslator(
146 new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
147 new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
148 new NumericEntityUnescaper()
149 );
150
151 public static final CharSequenceTranslator UNESCAPE_HTML4 =
152 new AggregateTranslator(
153 new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
154 new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
155 new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
156 new NumericEntityUnescaper()
157 );
158
159 public static final CharSequenceTranslator UNESCAPE_XML =
160 new AggregateTranslator(
161 new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
162 new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
163 new NumericEntityUnescaper()
164 );
165
166 public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();
167
168 static class CsvUnescaper extends CharSequenceTranslator {
169
170 private static final char CSV_DELIMITER = ',';
171 private static final char CSV_QUOTE = '"';
172 private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
173 private static final char[] CSV_SEARCH_CHARS = new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
174
175 // TODO: Replace with a RegexTranslator. That should consume the number of characters the regex uses up?
176 @Override
177 public int translate(CharSequence input, int index, Writer out) throws IOException {
178
179 if(index != 0) {
180 throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
181 }
182
183 if ( input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE ) {
184 out.write(input.toString());
185 return input.length();
186 }
187
188 // strip quotes
189 String quoteless = input.subSequence(1, input.length() - 1).toString();
190
191 if ( StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS) ) {
192 // deal with escaped quotes; ie) ""
193 out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
194 } else {
195 out.write(input.toString());
196 }
197 return input.length();
198 }
199 }
200
201 /* Helper functions */
202
203 /**
204 * <p><code>StringEscapeUtils</code> instances should NOT be constructed in
205 * standard programming.</p>
206 *
207 * <p>Instead, the class should be used as:
208 * <pre>StringEscapeUtils.escapeJava("foo");</pre></p>
209 *
210 * <p>This constructor is public to permit tools that require a JavaBean
211 * instance to operate.</p>
212 */
213 public StringEscapeUtils() {
214 super();
215 }
216
217 // Java and JavaScript
218 //--------------------------------------------------------------------------
219 /**
220 * <p>Escapes the characters in a <code>String</code> using Java String rules.</p>
221 *
222 * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
223 *
224 * <p>So a tab becomes the characters <code>'\\'</code> and
225 * <code>'t'</code>.</p>
226 *
227 * <p>The only difference between Java strings and JavaScript strings
228 * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
229 *
230 * <p>Example:
231 * <pre>
232 * input string: He didn't say, "Stop!"
233 * output string: He didn't say, \"Stop!\"
234 * </pre>
235 * </p>
236 *
237 * @param input String to escape values in, may be null
238 * @return String with escaped values, <code>null</code> if null string input
239 */
240 public static final String escapeJava(String input) {
241 return ESCAPE_JAVA.translate(input);
242 }
243
244 /**
245 * <p>Escapes the characters in a <code>String</code> using EcmaScript String rules.</p>
246 * <p>Escapes any values it finds into their EcmaScript String form.
247 * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
248 *
249 * <p>So a tab becomes the characters <code>'\\'</code> and
250 * <code>'t'</code>.</p>
251 *
252 * <p>The only difference between Java strings and EcmaScript strings
253 * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
254 *
255 * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p>
256 *
257 * <p>Example:
258 * <pre>
259 * input string: He didn't say, "Stop!"
260 * output string: He didn\'t say, \"Stop!\"
261 * </pre>
262 * </p>
263 *
264 * @param input String to escape values in, may be null
265 * @return String with escaped values, <code>null</code> if null string input
266 */
267 public static final String escapeEcmaScript(String input) {
268 return ESCAPE_ECMASCRIPT.translate(input);
269 }
270
271 /**
272 * <p>Unescapes any Java literals found in the <code>String</code>.
273 * For example, it will turn a sequence of <code>'\'</code> and
274 * <code>'n'</code> into a newline character, unless the <code>'\'</code>
275 * is preceded by another <code>'\'</code>.</p>
276 *
277 * @param input the <code>String</code> to unescape, may be null
278 * @return a new unescaped <code>String</code>, <code>null</code> if null string input
279 */
280 public static final String unescapeJava(String input) {
281 return UNESCAPE_JAVA.translate(input);
282 }
283
284 /**
285 * <p>Unescapes any EcmaScript literals found in the <code>String</code>.</p>
286 *
287 * <p>For example, it will turn a sequence of <code>'\'</code> and <code>'n'</code>
288 * into a newline character, unless the <code>'\'</code> is preceded by another
289 * <code>'\'</code>.</p>
290 *
291 * @see #unescapeJava(String)
292 * @param input the <code>String</code> to unescape, may be null
293 * @return A new unescaped <code>String</code>, <code>null</code> if null string input
294 */
295 public static final String unescapeEcmaScript(String input) {
296 return UNESCAPE_ECMASCRIPT.translate(input);
297 }
298
299 // HTML and XML
300 //--------------------------------------------------------------------------
301 /**
302 * <p>Escapes the characters in a <code>String</code> using HTML entities.</p>
303 *
304 * <p>
305 * For example:
306 * </p>
307 * <p><code>"bread" & "butter"</code></p>
308 * becomes:
309 * <p>
310 * <code>&quot;bread&quot; &amp; &quot;butter&quot;</code>.
311 * </p>
312 *
313 * <p>Supports all known HTML 4.0 entities, including funky accents.
314 * Note that the commonly used apostrophe escape character (&apos;)
315 * is not a legal entity and so is not supported). </p>
316 *
317 * @param input the <code>String</code> to escape, may be null
318 * @return a new escaped <code>String</code>, <code>null</code> if null string input
319 *
320 * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
321 * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
322 * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
323 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
324 * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
325 */
326 public static final String escapeHtml4(String input) {
327 return ESCAPE_HTML4.translate(input);
328 }
329
330 public static final String escapeHtml3(String input) {
331 return ESCAPE_HTML3.translate(input);
332 }
333
334 //-----------------------------------------------------------------------
335 /**
336 * <p>Unescapes a string containing entity escapes to a string
337 * containing the actual Unicode characters corresponding to the
338 * escapes. Supports HTML 4.0 entities.</p>
339 *
340 * <p>For example, the string "&lt;Fran&ccedil;ais&gt;"
341 * will become "<Français>"</p>
342 *
343 * <p>If an entity is unrecognized, it is left alone, and inserted
344 * verbatim into the result string. e.g. "&gt;&zzzz;x" will
345 * become ">&zzzz;x".</p>
346 *
347 * @param input the <code>String</code> to unescape, may be null
348 * @return a new unescaped <code>String</code>, <code>null</code> if null string input
349 */
350 public static final String unescapeHtml4(String input) {
351 return UNESCAPE_HTML4.translate(input);
352 }
353
354 public static final String unescapeHtml3(String input) {
355 return UNESCAPE_HTML3.translate(input);
356 }
357
358 //-----------------------------------------------------------------------
359 /**
360 * <p>Escapes the characters in a <code>String</code> using XML entities.</p>
361 *
362 * <p>For example: <tt>"bread" & "butter"</tt> =>
363 * <tt>&quot;bread&quot; &amp; &quot;butter&quot;</tt>.
364 * </p>
365 *
366 * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
367 * Does not support DTDs or external entities.</p>
368 *
369 * <p>Note that unicode characters greater than 0x7f are as of 3.0, no longer
370 * escaped. </p>
371 *
372 * @param input the <code>String</code> to escape, may be null
373 * @return a new escaped <code>String</code>, <code>null</code> if null string input
374 * @see #unescapeXml(java.lang.String)
375 */
376 public static final String escapeXml(String input) {
377 return ESCAPE_XML.translate(input);
378 }
379
380
381 //-----------------------------------------------------------------------
382 /**
383 * <p>Unescapes a string containing XML entity escapes to a string
384 * containing the actual Unicode characters corresponding to the
385 * escapes.</p>
386 *
387 * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
388 * Does not support DTDs or external entities.</p>
389 *
390 * <p>Note that numerical \\u unicode codes are unescaped to their respective
391 * unicode characters. This may change in future releases. </p>
392 *
393 * @param input the <code>String</code> to unescape, may be null
394 * @return a new unescaped <code>String</code>, <code>null</code> if null string input
395 * @see #escapeXml(String)
396 */
397 public static final String unescapeXml(String input) {
398 return UNESCAPE_XML.translate(input);
399 }
400
401
402 //-----------------------------------------------------------------------
403
404 /**
405 * <p>Returns a <code>String</code> value for a CSV column enclosed in double quotes,
406 * if required.</p>
407 *
408 * <p>If the value contains a comma, newline or double quote, then the
409 * String value is returned enclosed in double quotes.</p>
410 * </p>
411 *
412 * <p>Any double quote characters in the value are escaped with another double quote.</p>
413 *
414 * <p>If the value does not contain a comma, newline or double quote, then the
415 * String value is returned unchanged.</p>
416 * </p>
417 *
418 * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
419 * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
420 *
421 * @param input the input CSV column String, may be null
422 * @return the input String, enclosed in double quotes if the value contains a comma,
423 * newline or double quote, <code>null</code> if null string input
424 * @since 2.4
425 */
426 public static final String escapeCsv(String input) {
427 return ESCAPE_CSV.translate(input);
428 }
429
430 /**
431 * <p>Returns a <code>String</code> value for an unescaped CSV column. </p>
432 *
433 * <p>If the value is enclosed in double quotes, and contains a comma, newline
434 * or double quote, then quotes are removed.
435 * </p>
436 *
437 * <p>Any double quote escaped characters (a pair of double quotes) are unescaped
438 * to just one double quote. </p>
439 *
440 * <p>If the value is not enclosed in double quotes, or is and does not contain a
441 * comma, newline or double quote, then the String value is returned unchanged.</p>
442 * </p>
443 *
444 * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
445 * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
446 *
447 * @param input the input CSV column String, may be null
448 * @return the input String, with enclosing double quotes removed and embedded double
449 * quotes unescaped, <code>null</code> if null string input
450 * @since 2.4
451 */
452 public static final String unescapeCsv(String input) {
453 return UNESCAPE_CSV.translate(input);
454 }
455
456 }