001 package hirondelle.web4j.util;
002
003 import java.net.URLEncoder;
004 import java.io.UnsupportedEncodingException;
005 import java.text.CharacterIterator;
006 import java.text.StringCharacterIterator;
007 import java.util.regex.Pattern;
008 import java.util.regex.Matcher;
009
010 import hirondelle.web4j.security.SafeText;
011 import hirondelle.web4j.ui.translate.Text;
012 import hirondelle.web4j.ui.translate.Tooltips;
013 import hirondelle.web4j.ui.translate.TextFlow;
014 import hirondelle.web4j.ui.tag.Populate;
015 import hirondelle.web4j.database.Report;
016
/**
 Convenience methods for escaping special characters related to HTML, XML,
 JSON, URLs, and regular expressions.

 <P>To keep you safe by default, WEB4J goes to some effort to escape
 characters in your data when appropriate, such that you <em>usually</em>
 don't need to think too much about escaping special characters. Thus, you
 shouldn't need to <em>directly</em> use the services of this class very often.

 <P><span class='highlight'>For Model Objects containing free form user input,
 it is highly recommended that you use {@link SafeText}, not <tt>String</tt></span>.
 Free form user input is open to malicious use, such as
 <a href='http://www.owasp.org/index.php/Cross_Site_Scripting'>Cross Site Scripting</a>
 attacks.
 Using <tt>SafeText</tt> will protect you from such attacks, by always escaping
 special characters automatically in its <tt>toString()</tt> method.

 <P>The following WEB4J classes will automatically escape special characters
 for you, when needed :
 <ul>
 <li>the {@link SafeText} class, used as a building block class for your
 application's Model Objects, for modeling all free form user input
 <li>the {@link Populate} tag used with forms
 <li>the {@link Report} class used for creating quick reports
 <li>the {@link Text}, {@link TextFlow}, and {@link Tooltips} custom tags used
 for translation
 </ul>

 <P>All methods are static and stateless. Every method taking a <tt>String</tt>
 throws <tt>NullPointerException</tt> when passed <tt>null</tt>.
*/
public final class EscapeChars {

  /**
   Escape characters for text appearing in HTML markup.

   <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   The idea is to neutralize control characters commonly used by scripts, such that
   they will not be executed by the browser. This is done by replacing the control
   characters with their escaped equivalents.
   See {@link hirondelle.web4j.security.SafeText} as well.

   <P>The following characters are replaced with corresponding
   HTML character entities :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th>Replacement</th></tr>
   <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   <tr><td> " </td><td> &amp;quot;</td></tr>
   <tr><td> \t </td><td> &amp;#009;</td></tr>
   <tr><td> ! </td><td> &amp;#033;</td></tr>
   <tr><td> # </td><td> &amp;#035;</td></tr>
   <tr><td> $ </td><td> &amp;#036;</td></tr>
   <tr><td> % </td><td> &amp;#037;</td></tr>
   <tr><td> ' </td><td> &amp;#039;</td></tr>
   <tr><td> ( </td><td> &amp;#040;</td></tr>
   <tr><td> ) </td><td> &amp;#041;</td></tr>
   <tr><td> * </td><td> &amp;#042;</td></tr>
   <tr><td> + </td><td> &amp;#043; </td></tr>
   <tr><td> , </td><td> &amp;#044; </td></tr>
   <tr><td> - </td><td> &amp;#045; </td></tr>
   <tr><td> . </td><td> &amp;#046; </td></tr>
   <tr><td> / </td><td> &amp;#047; </td></tr>
   <tr><td> : </td><td> &amp;#058;</td></tr>
   <tr><td> ; </td><td> &amp;#059;</td></tr>
   <tr><td> = </td><td> &amp;#061;</td></tr>
   <tr><td> ? </td><td> &amp;#063;</td></tr>
   <tr><td> @ </td><td> &amp;#064;</td></tr>
   <tr><td> [ </td><td> &amp;#091;</td></tr>
   <tr><td> \ </td><td> &amp;#092;</td></tr>
   <tr><td> ] </td><td> &amp;#093;</td></tr>
   <tr><td> ^ </td><td> &amp;#094;</td></tr>
   <tr><td> _ </td><td> &amp;#095;</td></tr>
   <tr><td> ` </td><td> &amp;#096;</td></tr>
   <tr><td> { </td><td> &amp;#123;</td></tr>
   <tr><td> | </td><td> &amp;#124;</td></tr>
   <tr><td> } </td><td> &amp;#125;</td></tr>
   <tr><td> ~ </td><td> &amp;#126;</td></tr>
   </table>

   <P>All other characters are copied to the result unchanged. This method
   escapes many more characters than {@link #forXML(String)} does.

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return the escaped text, safe for rendering in HTML markup.
  */
  public static String forHTML(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    //iterate by index : a CharacterIterator would stop early on U+FFFF,
    //since that value doubles as its DONE sentinel
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '&':
          result.append("&amp;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\t': case '!': case '#': case '$': case '%': case '\'':
        case '(': case ')': case '*': case '+': case ',': case '-':
        case '.': case '/': case ':': case ';': case '=': case '?':
        case '@': case '[': case '\\': case ']': case '^': case '_':
        case '`': case '{': case '|': case '}': case '~':
          //the numeric entity is simply the character's code point
          addCharEntity(character, result);
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   Escape all ampersand characters in a URL.

   <P>Replaces all <tt>'&amp;'</tt> characters with <tt>'&amp;amp;'</tt>.

   <P>An ampersand character may appear in the query string of a URL.
   The ampersand character is indeed valid in a URL.
   <em>However, URLs usually appear as an <tt>HREF</tt> attribute, and
   such attributes have the additional constraint that ampersands
   must be escaped.</em>

   <P>The JSTL {@code <c:url>} tag does indeed perform proper URL encoding of
   query parameters. But it does not, in general, produce text which
   is valid as an <tt>HREF</tt> attribute, simply because it does
   not escape the ampersand character. This is a nuisance when
   multiple query parameters appear in the URL, since it requires a little
   extra work.

   @param aURL the URL to be escaped; must not be <tt>null</tt>.
   @return <tt>aURL</tt> with every ampersand replaced by its HTML entity.
  */
  public static String forHrefAmpersand(String aURL){
    return aURL.replace("&", "&amp;");
  }

  /**
   Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.

   <P>Used to ensure that HTTP query strings are in proper form, by escaping
   special characters such as spaces.

   <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   attribute, then there are two issues - ensuring the query string is valid HTTP
   (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   ampersand is escaped).

   @param aURLFragment the text to be URL-encoded; must not be <tt>null</tt>.
   @return the URL-encoded form of <tt>aURLFragment</tt>.
   @throws RuntimeException if the UTF-8 encoding is reported as unsupported;
   the original exception is attached as the cause.
  */
  public static String forURL(String aURLFragment){
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex){
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   Escape characters for text appearing as XML data, between tags.

   <P>The following characters are replaced with corresponding character entities :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Encoding </th></tr>
   <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   <tr><td> " </td><td> &amp;quot;</td></tr>
   <tr><td> ' </td><td> &amp;#039;</td></tr>
   </table>

   <P>Note that JSTL's {@code <c:out>} escapes the exact same set of
   characters as this method. <span class='highlight'>That is, {@code <c:out>}
   is good for escaping to produce valid XML, but not for producing safe
   HTML.</span>

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return the escaped text.
  */
  public static String forXML(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   Escapes characters for text appearing as data in the
   <a href='http://www.json.org/'>Javascript Object Notation</a>
   (JSON) data interchange format.

   <P>The following commonly used control characters are escaped :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Escaped As </th></tr>
   <tr><td> " </td><td> \" </td></tr>
   <tr><td> \ </td><td> \\ </td></tr>
   <tr><td> / </td><td> \/ </td></tr>
   <tr><td> back space </td><td> \b </td></tr>
   <tr><td> form feed </td><td> \f </td></tr>
   <tr><td> line feed </td><td> \n </td></tr>
   <tr><td> carriage return </td><td> \r </td></tr>
   <tr><td> tab </td><td> \t </td></tr>
   </table>

   <P>Any other control character below U+0020 is emitted as a six character
   Unicode escape (<tt>&#92;u</tt> followed by four lower case hex digits),
   since a raw control character is not permitted inside a JSON string.

   <P>See <a href='http://www.ietf.org/rfc/rfc4627.txt'>RFC 4627</a> for more information.

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return the escaped text, suitable for placing between the double quotes
   of a JSON string. The surrounding quotes themselves are not added.
  */
  public static String forJSON(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '"':
          result.append("\\\"");
          break;
        case '\\':
          result.append("\\\\");
          break;
        case '/':
          result.append("\\/");
          break;
        case '\b':
          result.append("\\b");
          break;
        case '\f':
          result.append("\\f");
          break;
        case '\n':
          result.append("\\n");
          break;
        case '\r':
          result.append("\\r");
          break;
        case '\t':
          result.append("\\t");
          break;
        default:
          if (character < 0x20) {
            //remaining control chars have no short form
            addUnicodeEscape(character, result);
          }
          else {
            //the char is not a special one - add it to the result as is
            result.append(character);
          }
          break;
      }
    }
    return result.toString();
  }

  /**
   Return <tt>aText</tt> with all <tt>'&lt;'</tt> and <tt>'&gt;'</tt> characters
   replaced by their escaped equivalents (<tt>&amp;lt;</tt> and <tt>&amp;gt;</tt>).

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return the text, with tag delimiters disabled.
  */
  public static String toDisableTags(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        //the char is not a special one - add it to the result as is
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   Replace characters having special meaning in regular expressions
   with their escaped equivalents, preceded by a '\' character.

   <P>The escaped characters include :
   <ul>
   <li>.
   <li>\
   <li>?, * , and +
   <li>&amp;
   <li>:
   <li>|
   <li>{ and }
   <li>[ and ]
   <li>( and )
   <li>^ and $
   </ul>

   @param aRegexFragment text to be matched literally; must not be <tt>null</tt>.
   @return the text, with each character listed above preceded by a backslash.
  */
  public static String forRegex(String aRegexFragment){
    final StringBuilder result = new StringBuilder(aRegexFragment.length() + 16);
    for (int idx = 0; idx < aRegexFragment.length(); ++idx) {
      final char character = aRegexFragment.charAt(idx);
      switch (character) {
        case '.': case '\\': case '?': case '*': case '+': case '&':
        case ':': case '{': case '}': case '[': case ']': case '(':
        case ')': case '^': case '$':
        case '|': //alternation : left unescaped, "a|b" would also match "a"
          result.append('\\').append(character);
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   Escape <tt>'$'</tt> and <tt>'\'</tt> characters in replacement strings.

   <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.

   <P>The following methods use replacement strings which treat
   <tt>'$'</tt> and <tt>'\'</tt> as special characters:
   <ul>
   <li><tt>String.replaceAll(String, String)</tt>
   <li><tt>String.replaceFirst(String, String)</tt>
   <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   </ul>

   <P>If replacement text can contain arbitrary characters, then you
   will usually need to escape that text, to ensure special characters
   are interpreted literally.

   @param aInput the replacement text; must not be <tt>null</tt>.
   @return the text, as returned by <tt>Matcher.quoteReplacement(String)</tt>.
  */
  public static String forReplacementString(String aInput){
    return Matcher.quoteReplacement(aInput);
  }

  /**
   Disable all <tt>&lt;SCRIPT&gt;</tt> tags in <tt>aText</tt>.

   <P>Insensitive to case. Each opening tag is replaced with
   <tt>&amp;lt;SCRIPT&amp;gt;</tt>, and each closing tag with
   <tt>&amp;lt;/SCRIPT&amp;gt;</tt>; the replacement is always upper case.

   <P>Only the bare forms <tt>&lt;SCRIPT&gt;</tt> and <tt>&lt;/SCRIPT&gt;</tt>
   are matched. An opening tag carrying attributes or internal white space is
   left untouched.

   @param aText text to be processed; must not be <tt>null</tt>.
   @return the text, with bare script tags disabled.
  */
  public static String forScriptTagsOnly(String aText){
    final String openingDisabled = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT&gt;");
    return SCRIPT_END.matcher(openingDisabled).replaceAll("&lt;/SCRIPT&gt;");
  }

  // PRIVATE //

  private EscapeChars(){
    //empty - prevent construction
  }

  /** Matches a bare opening script tag, in any mix of upper and lower case. */
  private static final Pattern SCRIPT = Pattern.compile(
    "<SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /** Matches a bare closing script tag, in any mix of upper and lower case. */
  private static final Pattern SCRIPT_END = Pattern.compile(
    "</SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /**
   Append a decimal numeric character entity to <tt>aBuilder</tt>, with the
   number left-padded with zeros to at least three digits.
  */
  private static void addCharEntity(int aIdx, StringBuilder aBuilder){
    aBuilder.append("&#");
    if (aIdx <= 9) {
      aBuilder.append("00");
    }
    else if (aIdx <= 99) {
      aBuilder.append('0');
    }
    aBuilder.append(aIdx).append(';');
  }

  /**
   Append the JSON Unicode escape for <tt>aChar</tt> to <tt>aBuilder</tt> :
   a backslash, the letter u, and exactly four lower case hex digits.
  */
  private static void addUnicodeEscape(char aChar, StringBuilder aBuilder){
    final String hex = Integer.toHexString(aChar);
    aBuilder.append("\\u");
    for (int pad = hex.length(); pad < 4; ++pad) {
      aBuilder.append('0');
    }
    aBuilder.append(hex);
  }
}