package hirondelle.web4j.util;

import java.net.URLEncoder;
import java.io.UnsupportedEncodingException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import hirondelle.web4j.security.SafeText;
import hirondelle.web4j.ui.translate.Text;
import hirondelle.web4j.ui.translate.Tooltips;
import hirondelle.web4j.ui.translate.TextFlow;
import hirondelle.web4j.ui.tag.Populate;
import hirondelle.web4j.database.Report;

/**
 Convenience methods for escaping special characters related to HTML, XML,
 JSON, URLs, and regular expressions.

 <P>To keep you safe by default, WEB4J goes to some effort to escape
 characters in your data when appropriate, such that you <em>usually</em>
 don't need to think too much about escaping special characters. Thus, you
 shouldn't need to <em>directly</em> use the services of this class very often.

 <P><span class='highlight'>For Model Objects containing free form user input,
 it is highly recommended that you use {@link SafeText}, not <tt>String</tt></span>.
 Free form user input is open to malicious use, such as
 <a href='http://www.owasp.org/index.php/Cross_Site_Scripting'>Cross Site Scripting</a>
 attacks.
 Using <tt>SafeText</tt> will protect you from such attacks, by always escaping
 special characters automatically in its <tt>toString()</tt> method.

 <P>The following WEB4J classes will automatically escape special characters
 for you, when needed :
 <ul>
 <li>the {@link SafeText} class, used as a building block class for your
 application's Model Objects, for modeling all free form user input
 <li>the {@link Populate} tag used with forms
 <li>the {@link Report} class used for creating quick reports
 <li>the {@link Text}, {@link TextFlow}, and {@link Tooltips} custom tags used
 for translation
 </ul>

 <P>All methods are static and keep no state between calls. The methods which
 scan their argument character by character visit every <tt>char</tt> of the
 input, including U+FFFF.
*/
public final class EscapeChars {

  /**
   Escape characters for text appearing in HTML markup.

   <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   The idea is to neutralize control characters commonly used by scripts, such that
   they will not be executed by the browser. This is done by replacing the control
   characters with their escaped equivalents.
   See {@link hirondelle.web4j.security.SafeText} as well.

   <P>The following characters are replaced with corresponding
   HTML character entities :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th>Replacement</th></tr>
   <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   <tr><td> &quot; </td><td> &amp;quot;</td></tr>
   <tr><td> tab </td><td> &amp;#009;</td></tr>
   <tr><td> ! </td><td> &amp;#033;</td></tr>
   <tr><td> # </td><td> &amp;#035;</td></tr>
   <tr><td> $ </td><td> &amp;#036;</td></tr>
   <tr><td> % </td><td> &amp;#037;</td></tr>
   <tr><td> ' </td><td> &amp;#039;</td></tr>
   <tr><td> ( </td><td> &amp;#040;</td></tr>
   <tr><td> ) </td><td> &amp;#041;</td></tr>
   <tr><td> * </td><td> &amp;#042;</td></tr>
   <tr><td> + </td><td> &amp;#043;</td></tr>
   <tr><td> , </td><td> &amp;#044;</td></tr>
   <tr><td> - </td><td> &amp;#045;</td></tr>
   <tr><td> . </td><td> &amp;#046;</td></tr>
   <tr><td> / </td><td> &amp;#047;</td></tr>
   <tr><td> : </td><td> &amp;#058;</td></tr>
   <tr><td> ; </td><td> &amp;#059;</td></tr>
   <tr><td> = </td><td> &amp;#061;</td></tr>
   <tr><td> ? </td><td> &amp;#063;</td></tr>
   <tr><td> @ </td><td> &amp;#064;</td></tr>
   <tr><td> [ </td><td> &amp;#091;</td></tr>
   <tr><td> \ </td><td> &amp;#092;</td></tr>
   <tr><td> ] </td><td> &amp;#093;</td></tr>
   <tr><td> ^ </td><td> &amp;#094;</td></tr>
   <tr><td> _ </td><td> &amp;#095;</td></tr>
   <tr><td> ` </td><td> &amp;#096;</td></tr>
   <tr><td> { </td><td> &amp;#123;</td></tr>
   <tr><td> | </td><td> &amp;#124;</td></tr>
   <tr><td> } </td><td> &amp;#125;</td></tr>
   <tr><td> ~ </td><td> &amp;#126;</td></tr>
   </table>

   <P>Note that JSTL's {@code <c:out>} escapes <em>only five</em> of the above
   characters: {@code <}, {@code >}, {@code &}, {@code "} and {@code '}
   (the same set handled by {@link #forXML(String)}).

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return a copy of <tt>aText</tt> with the above replacements applied.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String forHTML(String aText){
    final StringBuilder result = new StringBuilder(aText.length());
    for (int idx = 0; idx < aText.length(); ++idx){
      final char character = aText.charAt(idx);
      switch (character){
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '&':
          result.append("&amp;");
          break;
        case '"':
          result.append("&quot;");
          break;
        //every remaining special character is written as a zero-padded,
        //3-digit numeric entity whose number is the character's own code
        case '\t': case '!': case '#': case '$': case '%': case '\'':
        case '(': case ')': case '*': case '+': case ',': case '-':
        case '.': case '/': case ':': case ';': case '=': case '?':
        case '@': case '[': case '\\': case ']': case '^': case '_':
        case '`': case '{': case '|': case '}': case '~':
          addCharEntity(character, result);
          break;
        default:
          //the char is not a special one; add it to the result as is
          result.append(character);
      }
    }
    return result.toString();
  }

  /**
   Escape all ampersand characters in a URL.

   <P>Replaces all <tt>'&amp;'</tt> characters with <tt>'&amp;amp;'</tt>.

   <P>An ampersand character may appear in the query string of a URL.
   The ampersand character is indeed valid in a URL.
   <em>However, URLs usually appear as an <tt>HREF</tt> attribute, and
   such attributes have the additional constraint that ampersands
   must be escaped.</em>

   <P>The JSTL {@code <c:url>} tag does indeed perform proper URL encoding of
   query parameters. But it does not, in general, produce text which
   is valid as an <tt>HREF</tt> attribute, simply because it does
   not escape the ampersand character. This is a nuisance when
   multiple query parameters appear in the URL, since it requires a little
   extra work.

   <P>Every ampersand is replaced, including one that already starts an
   entity; passing already-escaped text will escape it a second time.

   @param aURL the URL to be escaped; must not be <tt>null</tt>.
   @return <tt>aURL</tt> with each ampersand replaced.
   @throws NullPointerException if <tt>aURL</tt> is <tt>null</tt>.
  */
  public static String forHrefAmpersand(String aURL){
    return aURL.replace("&", "&amp;");
  }

  /**
   Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.

   <P>Used to ensure that HTTP query strings are in proper form, by escaping
   special characters such as spaces.

   <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   attribute, then there are two issues - ensuring the query string is valid HTTP
   (it is URL-encoded), and ensuring it is valid HTML (ensuring the
   ampersand is escaped).

   @param aURLFragment the text to be URL-encoded.
   @return the URL-encoded form of <tt>aURLFragment</tt>.
   @throws RuntimeException wrapping the <tt>UnsupportedEncodingException</tt>,
   if the runtime reports that UTF-8 is not supported.
  */
  public static String forURL(String aURLFragment){
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex){
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   Escape characters for text appearing as XML data, between tags.

   <P>The following characters are replaced with corresponding character entities :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Encoding </th></tr>
   <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   <tr><td> &quot; </td><td> &amp;quot;</td></tr>
   <tr><td> ' </td><td> &amp;#039;</td></tr>
   </table>

   <P>Note that JSTL's {@code <c:out>} escapes the exact same set of
   characters as this method. <span class='highlight'>That is, {@code <c:out>}
   is good for escaping to produce valid XML, but not for producing safe
   HTML.</span>

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return a copy of <tt>aText</tt> with the above replacements applied.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String forXML(String aText){
    final StringBuilder result = new StringBuilder(aText.length());
    for (int idx = 0; idx < aText.length(); ++idx){
      final char character = aText.charAt(idx);
      switch (character){
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          //the char is not a special one; add it to the result as is
          result.append(character);
      }
    }
    return result.toString();
  }

  /**
   Escapes characters for text appearing as data in the
   <a href='http://www.json.org/'>Javascript Object Notation</a>
   (JSON) data interchange format.

   <P>The following commonly used control characters are escaped :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Escaped As </th></tr>
   <tr><td> " </td><td> \" </td></tr>
   <tr><td> \ </td><td> \\ </td></tr>
   <tr><td> / </td><td> \/ </td></tr>
   <tr><td> back space </td><td> \b </td></tr>
   <tr><td> form feed </td><td> \f </td></tr>
   <tr><td> line feed </td><td> \n </td></tr>
   <tr><td> carriage return </td><td> \r </td></tr>
   <tr><td> tab </td><td> \t </td></tr>
   <tr><td> any other character below U+0020 </td>
       <td> six-character Unicode escape: a backslash, the letter <tt>u</tt>,
            and four lower-case hex digits </td></tr>
   </table>

   <P>See <a href='http://www.ietf.org/rfc/rfc4627.txt'>RFC 4627</a> for more information.

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return a copy of <tt>aText</tt> with the above replacements applied. The
   surrounding quotation marks of a JSON string are <em>not</em> added.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String forJSON(String aText){
    final StringBuilder result = new StringBuilder(aText.length());
    for (int idx = 0; idx < aText.length(); ++idx){
      final char character = aText.charAt(idx);
      switch (character){
        case '"':
          result.append("\\\"");
          break;
        case '\\':
          result.append("\\\\");
          break;
        case '/':
          result.append("\\/");
          break;
        case '\b':
          result.append("\\b");
          break;
        case '\f':
          result.append("\\f");
          break;
        case '\n':
          result.append("\\n");
          break;
        case '\r':
          result.append("\\r");
          break;
        case '\t':
          result.append("\\t");
          break;
        default:
          if (character < 0x20){
            //RFC 4627 does not allow raw control characters inside a string;
            //pad the hex value out to four digits
            result.append(character < 0x10 ? "\\u000" : "\\u00");
            result.append(Integer.toHexString(character));
          }
          else {
            //the char is not a special one; add it to the result as is
            result.append(character);
          }
      }
    }
    return result.toString();
  }

  /**
   Return <tt>aText</tt> with all <tt>'&lt;'</tt> and <tt>'&gt;'</tt> characters
   replaced by their escaped equivalents, <tt>'&amp;lt;'</tt> and <tt>'&amp;gt;'</tt>.

   @param aText text to be escaped; must not be <tt>null</tt>.
   @return a copy of <tt>aText</tt> with the above replacements applied.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String toDisableTags(String aText){
    final StringBuilder result = new StringBuilder(aText.length());
    for (int idx = 0; idx < aText.length(); ++idx){
      final char character = aText.charAt(idx);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        //the char is not a special one; add it to the result as is
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   Replace characters having special meaning in regular expressions
   with their escaped equivalents, preceded by a '\' character.

   <P>The escaped characters include :
   <ul>
   <li>.
   <li>\
   <li>?, * , and +
   <li>&amp;
   <li>:
   <li>|
   <li>{ and }
   <li>[ and ]
   <li>( and )
   <li>^ and $
   </ul>

   @param aRegexFragment text to be matched literally; must not be <tt>null</tt>.
   @return a copy of <tt>aRegexFragment</tt> in which each of the above
   characters is preceded by a single backslash.
   @throws NullPointerException if <tt>aRegexFragment</tt> is <tt>null</tt>.
  */
  public static String forRegex(String aRegexFragment){
    final StringBuilder result = new StringBuilder(aRegexFragment.length());
    for (int idx = 0; idx < aRegexFragment.length(); ++idx){
      final char character = aRegexFragment.charAt(idx);
      switch (character){
        case '.': case '\\': case '?': case '*': case '+':
        case '&': case ':': case '|':
        case '{': case '}': case '[': case ']': case '(': case ')':
        case '^': case '$':
          //emit one backslash, then the character itself
          result.append('\\').append(character);
          break;
        default:
          //the char is not a special one; add it to the result as is
          result.append(character);
      }
    }
    return result.toString();
  }

  /**
   Escape <tt>'$'</tt> and <tt>'\'</tt> characters in replacement strings.

   <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.

   <P>The following methods use replacement strings which treat
   <tt>'$'</tt> and <tt>'\'</tt> as special characters:
   <ul>
   <li><tt>String.replaceAll(String, String)</tt>
   <li><tt>String.replaceFirst(String, String)</tt>
   <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   </ul>

   <P>If replacement text can contain arbitrary characters, then you
   will usually need to escape that text, to ensure special characters
   are interpreted literally.

   @param aInput the replacement text to be quoted.
   @return the value returned by <tt>Matcher.quoteReplacement(aInput)</tt>.
  */
  public static String forReplacementString(String aInput){
    return Matcher.quoteReplacement(aInput);
  }

  /**
   Disable all <tt>&lt;SCRIPT&gt;</tt> tags in <tt>aText</tt>.

   <P>Each opening tag is replaced by <tt>&amp;lt;SCRIPT&amp;gt;</tt> and each
   closing tag by <tt>&amp;lt;/SCRIPT&amp;gt;</tt>. The replacement is always
   written in upper case.

   <P>Insensitive to case. Only the exact forms <tt>&lt;SCRIPT&gt;</tt> and
   <tt>&lt;/SCRIPT&gt;</tt> are matched; a tag that carries attributes or
   embedded white space is left untouched.

   @param aText text in which script tags are to be disabled.
   @return <tt>aText</tt> with the matching tags replaced.
  */
  public static String forScriptTagsOnly(String aText){
    final String openingTagsDisabled = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT&gt;");
    return SCRIPT_END.matcher(openingTagsDisabled).replaceAll("&lt;/SCRIPT&gt;");
  }

  // PRIVATE //

  private EscapeChars(){
    //empty - prevent construction
  }

  /** Matches an opening script tag that has no attributes, in any letter case. */
  private static final Pattern SCRIPT = Pattern.compile(
    "<SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /** Matches a closing script tag, in any letter case. */
  private static final Pattern SCRIPT_END = Pattern.compile(
    "</SCRIPT>", Pattern.CASE_INSENSITIVE
  );

  /**
   Append a numeric character entity for <tt>aCode</tt> to <tt>aBuilder</tt>.
   The number is written in decimal, left-padded with zeros to at least 3 digits.
  */
  private static void addCharEntity(int aCode, StringBuilder aBuilder){
    aBuilder.append("&#");
    if (aCode <= 9){
      aBuilder.append("00");
    }
    else if (aCode <= 99){
      aBuilder.append('0');
    }
    aBuilder.append(aCode).append(';');
  }
}