001    package hirondelle.web4j.util;
002    
003    import java.net.URLEncoder;
004    import java.io.UnsupportedEncodingException;
005    import java.text.CharacterIterator;
006    import java.text.StringCharacterIterator;
007    import java.util.regex.Pattern;
008    import java.util.regex.Matcher;
009    
010    import hirondelle.web4j.security.SafeText;
011    import hirondelle.web4j.ui.translate.Text;
012    import hirondelle.web4j.ui.translate.Tooltips;
013    import hirondelle.web4j.ui.translate.TextFlow;
014    import hirondelle.web4j.ui.tag.Populate;
015    import hirondelle.web4j.database.Report;
016    
017    /**
018     Convenience methods for escaping special characters related to HTML, XML, 
019     and regular expressions.
020     
021     <P>To keep you safe by default, WEB4J goes to some effort to escape 
022     characters in your data when appropriate, such that you <em>usually</em>
023     don't need to think too much about escaping special characters. Thus, you
024      shouldn't need to <em>directly</em> use the services of this class very often. 
025     
026     <P><span class='highlight'>For Model Objects containing free form user input, 
027     it is highly recommended that you use {@link SafeText}, not <tt>String</tt></span>.
028     Free form user input is open to malicious use, such as
029     <a href='http://www.owasp.org/index.php/Cross_Site_Scripting'>Cross Site Scripting</a>
030     attacks. 
031     Using <tt>SafeText</tt> will protect you from such attacks, by always escaping 
032     special characters automatically in its <tt>toString()</tt> method.   
033     
034     <P>The following WEB4J classes will automatically escape special characters 
035     for you, when needed : 
036     <ul>
037     <li>the {@link SafeText} class, used as a building block class for your 
038     application's Model Objects, for modeling all free form user input
039     <li>the {@link Populate} tag used with forms
040     <li>the {@link Report} class used for creating quick reports
041     <li>the {@link Text}, {@link TextFlow}, and {@link Tooltips} custom tags used 
042     for translation
043     </ul> 
044    */
public final class EscapeChars {

  /**
   Escape characters for text appearing in HTML markup.
   
   <P>This method exists as a defence against Cross Site Scripting (XSS) hacks.
   The idea is to neutralize control characters commonly used by scripts, such that
   they will not be executed by the browser. This is done by replacing the control
   characters with their escaped equivalents.  
   See {@link hirondelle.web4j.security.SafeText} as well.
   
   <P>The following characters are replaced with corresponding 
   HTML character entities :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th>Replacement</th></tr>
   <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   <tr><td> " </td><td> &amp;quot;</td></tr>
   <tr><td> \t </td><td> &amp;#009;</td></tr>
   <tr><td> ! </td><td> &amp;#033;</td></tr>
   <tr><td> # </td><td> &amp;#035;</td></tr>
   <tr><td> $ </td><td> &amp;#036;</td></tr>
   <tr><td> % </td><td> &amp;#037;</td></tr>
   <tr><td> ' </td><td> &amp;#039;</td></tr>
   <tr><td> ( </td><td> &amp;#040;</td></tr> 
   <tr><td> ) </td><td> &amp;#041;</td></tr>
   <tr><td> * </td><td> &amp;#042;</td></tr>
   <tr><td> + </td><td> &amp;#043; </td></tr>
   <tr><td> , </td><td> &amp;#044; </td></tr>
   <tr><td> - </td><td> &amp;#045; </td></tr>
   <tr><td> . </td><td> &amp;#046; </td></tr>
   <tr><td> / </td><td> &amp;#047; </td></tr>
   <tr><td> : </td><td> &amp;#058;</td></tr>
   <tr><td> ; </td><td> &amp;#059;</td></tr>
   <tr><td> = </td><td> &amp;#061;</td></tr>
   <tr><td> ? </td><td> &amp;#063;</td></tr>
   <tr><td> @ </td><td> &amp;#064;</td></tr>
   <tr><td> [ </td><td> &amp;#091;</td></tr>
   <tr><td> \ </td><td> &amp;#092;</td></tr>
   <tr><td> ] </td><td> &amp;#093;</td></tr>
   <tr><td> ^ </td><td> &amp;#094;</td></tr>
   <tr><td> _ </td><td> &amp;#095;</td></tr>
   <tr><td> ` </td><td> &amp;#096;</td></tr>
   <tr><td> { </td><td> &amp;#123;</td></tr>
   <tr><td> | </td><td> &amp;#124;</td></tr>
   <tr><td> } </td><td> &amp;#125;</td></tr>
   <tr><td> ~ </td><td> &amp;#126;</td></tr>
   </table>
   
   <P>All other characters, including letters, digits, spaces and line breaks, 
   are passed through unchanged.
   
   <P>Note that JSTL's {@code <c:out>} escapes <em>only five</em> of the above 
   characters : <tt>&lt;</tt>, <tt>&gt;</tt>, <tt>&amp;</tt>, <tt>"</tt>, 
   and <tt>'</tt>.
   
   @param aText the raw text to be escaped; must not be <tt>null</tt>.
   @return <tt>aText</tt> with each character in the above table replaced by its entity.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String forHTML(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    //iterate by index, not with CharacterIterator : its DONE sentinel is itself 
    //a legal char (U+FFFF), and would silently truncate text containing it
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '&':
          result.append("&amp;");
          break;
        case '"':
          result.append("&quot;");
          break;
        default:
          if (needsNumericEntity(character)) {
            addCharEntity(character, result);
          }
          else {
            //the char is not a special one - add it to the result as is
            result.append(character);
          }
          break;
      }
    }
    return result.toString();
  }

  /**
   Escape all ampersand characters in a URL. 
    
   <P>Replaces all <tt>'&amp;'</tt> characters with <tt>'&amp;amp;'</tt>.
   
   <P>An ampersand character may appear in the query string of a URL.
   The ampersand character is indeed valid in a URL.
   <em>However, URLs usually appear as an <tt>HREF</tt> attribute, and 
   such attributes have the additional constraint that ampersands 
   must be escaped.</em>
   
   <P>The JSTL &lt;c:url&gt; tag does indeed perform proper URL encoding of 
   query parameters. But it does not, in general, produce text which 
   is valid as an <tt>HREF</tt> attribute, simply because it does 
   not escape the ampersand character. This is a nuisance when 
   multiple query parameters appear in the URL, since it requires a little 
   extra work.
   
   @param aURL the URL text; must not be <tt>null</tt>.
   @return <tt>aURL</tt> with every ampersand replaced by <tt>&amp;amp;</tt>.
   @throws NullPointerException if <tt>aURL</tt> is <tt>null</tt>.
  */
  public static String forHrefAmpersand(String aURL){
    return aURL.replace("&", "&amp;");
  }

  /**
   Synonym for <tt>URLEncoder.encode(String, "UTF-8")</tt>.
   
   <P>Used to ensure that HTTP query strings are in proper form, by escaping
   special characters such as spaces.
   
   <P>It is important to note that if a query string appears in an <tt>HREF</tt>
   attribute, then there are two issues - ensuring the query string is valid HTTP
   (it is URL-encoded), and ensuring it is valid HTML (ensuring the 
   ampersand is escaped).
   
   @param aURLFragment the text to be URL-encoded; must not be <tt>null</tt>.
   @return the URL-encoded form of <tt>aURLFragment</tt>.
   @throws RuntimeException if the runtime reports that UTF-8 is not supported; 
   the original exception is retained as the cause.
  */
  public static String forURL(String aURLFragment){
    try {
      return URLEncoder.encode(aURLFragment, "UTF-8");
    }
    catch (UnsupportedEncodingException ex){
      throw new RuntimeException("UTF-8 not supported", ex);
    }
  }

  /**
   Escape characters for text appearing as XML data, between tags.
   
   <P>The following characters are replaced with corresponding character entities :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Encoding </th></tr>
   <tr><td> &lt; </td><td> &amp;lt; </td></tr>
   <tr><td> &gt; </td><td> &amp;gt; </td></tr>
   <tr><td> &amp; </td><td> &amp;amp; </td></tr>
   <tr><td> " </td><td> &amp;quot;</td></tr>
   <tr><td> ' </td><td> &amp;#039;</td></tr>
   </table>
   
   <P>Note that JSTL's {@code <c:out>} escapes the exact same set of 
   characters as this method. <span class='highlight'>That is, {@code <c:out>}
   is good for escaping to produce valid XML, but not for producing safe 
   HTML.</span>
   
   @param aText the raw text to be escaped; must not be <tt>null</tt>.
   @return <tt>aText</tt> with the five characters above replaced by their entities.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String forXML(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    //index-based loop : text containing U+FFFF must not be cut short
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '<':
          result.append("&lt;");
          break;
        case '>':
          result.append("&gt;");
          break;
        case '"':
          result.append("&quot;");
          break;
        case '\'':
          result.append("&#039;");
          break;
        case '&':
          result.append("&amp;");
          break;
        default:
          //the char is not a special one - add it to the result as is
          result.append(character);
          break;
      }
    }
    return result.toString();
  }

  /**
   Escapes characters for text appearing as data in the 
   <a href='http://www.json.org/'>Javascript Object Notation</a>
   (JSON) data interchange format.
   
   <P>The following characters are escaped :
   <table border='1' cellpadding='3' cellspacing='0'>
   <tr><th> Character </th><th> Escaped As </th></tr>
   <tr><td> " </td><td> \" </td></tr>
   <tr><td> \ </td><td> \\ </td></tr>
   <tr><td> / </td><td> \/ </td></tr>
   <tr><td> back space </td><td> \b </td></tr> 
   <tr><td> form feed </td><td> \f </td></tr>
   <tr><td> line feed </td><td> \n </td></tr>
   <tr><td> carriage return </td><td> \r </td></tr>
   <tr><td> tab </td><td> \t </td></tr>
   <tr><td> any other control character below U+0020 </td>
       <td> a backslash, the letter u, and four lower case hex digits </td></tr>
   </table>
   
   <P>JSON does not permit raw control characters inside a string, which is why 
   the last row is needed in addition to the short named escapes.
   
   <P>See <a href='http://www.ietf.org/rfc/rfc4627.txt'>RFC 4627</a> for more information.
   
   @param aText the raw text to be escaped; must not be <tt>null</tt>.
   @return <tt>aText</tt> in a form suitable for placing between the double quotes 
   of a JSON string.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String forJSON(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    //index-based loop : text containing U+FFFF must not be cut short
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      switch (character) {
        case '"':
          result.append("\\\"");
          break;
        case '\\':
          result.append("\\\\");
          break;
        case '/':
          result.append("\\/");
          break;
        case '\b':
          result.append("\\b");
          break;
        case '\f':
          result.append("\\f");
          break;
        case '\n':
          result.append("\\n");
          break;
        case '\r':
          result.append("\\r");
          break;
        case '\t':
          result.append("\\t");
          break;
        default:
          if (character < 0x20) {
            //remaining control chars have no short form, but must still be escaped
            addUnicodeEscape(character, result);
          }
          else {
            //the char is not a special one - add it to the result as is
            result.append(character);
          }
          break;
      }
    }
    return result.toString();
  }

  /**
   Return <tt>aText</tt> with all <tt>'&lt;'</tt> and <tt>'&gt;'</tt> characters
   replaced by their escaped equivalents.
   
   <P>No other character is altered; in particular, ampersands are left as is.
   
   @param aText the raw text; must not be <tt>null</tt>.
   @return <tt>aText</tt> with <tt>&lt;</tt> replaced by <tt>&amp;lt;</tt> and 
   <tt>&gt;</tt> replaced by <tt>&amp;gt;</tt>.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */
  public static String toDisableTags(String aText){
    final StringBuilder result = new StringBuilder(aText.length() + 16);
    //index-based loop : text containing U+FFFF must not be cut short
    for (int idx = 0; idx < aText.length(); ++idx) {
      final char character = aText.charAt(idx);
      if (character == '<') {
        result.append("&lt;");
      }
      else if (character == '>') {
        result.append("&gt;");
      }
      else {
        result.append(character);
      }
    }
    return result.toString();
  }

  /**
   Replace characters having special meaning in regular expressions
   with their escaped equivalents, preceded by a '\' character.
  
   <P>The escaped characters include :
   <ul>
   <li>.
   <li>\
   <li>?, * , and +
   <li>&amp;
   <li>:
   <li>{ and }
   <li>[ and ]
   <li>( and )
   <li>^ and $
   <li>| (alternation)
   </ul>
   
   @param aRegexFragment the text to be matched literally; must not be <tt>null</tt>.
   @return <tt>aRegexFragment</tt> with a backslash placed before each of the 
   characters listed above.
   @throws NullPointerException if <tt>aRegexFragment</tt> is <tt>null</tt>.
  */
  public static String forRegex(String aRegexFragment){
    final StringBuilder result = new StringBuilder(aRegexFragment.length() + 16);
    //index-based loop : text containing U+FFFF must not be cut short
    for (int idx = 0; idx < aRegexFragment.length(); ++idx) {
      final char character = aRegexFragment.charAt(idx);
      if (REGEX_SPECIAL_CHARS.indexOf(character) >= 0) {
        result.append('\\');
      }
      result.append(character);
    }
    return result.toString();
  }

  /**
   Escape <tt>'$'</tt> and <tt>'\'</tt> characters in replacement strings.
   
   <P>Synonym for <tt>Matcher.quoteReplacement(String)</tt>.
   
   <P>The following methods use replacement strings which treat 
   <tt>'$'</tt> and <tt>'\'</tt> as special characters:
   <ul>
   <li><tt>String.replaceAll(String, String)</tt>
   <li><tt>String.replaceFirst(String, String)</tt>
   <li><tt>Matcher.appendReplacement(StringBuffer, String)</tt>
   </ul>
   
   <P>If replacement text can contain arbitrary characters, then you 
   will usually need to escape that text, to ensure special characters 
   are interpreted literally.
   
   @param aInput the replacement text; must not be <tt>null</tt>.
   @return the result of <tt>Matcher.quoteReplacement(aInput)</tt>.
  */
  public static String forReplacementString(String aInput){
    return Matcher.quoteReplacement(aInput);
  }

  /**
   Disable all <tt>&lt;SCRIPT&gt;</tt> tags in <tt>aText</tt>.
   
   <P>Insensitive to case. Both start tags and end tags are disabled, by replacing 
   their enclosing <tt>&lt;</tt> and <tt>&gt;</tt> with <tt>&amp;lt;</tt> and 
   <tt>&amp;gt;</tt>, and by upper-casing the tag name. Start tags carrying 
   attributes (for example <tt>&lt;script src='...'&gt;</tt>) are disabled as 
   well; the attribute text itself is retained, unchanged. 
   
   @param aText the raw text; must not be <tt>null</tt>.
   @return <tt>aText</tt> with its script tags rendered inert.
   @throws NullPointerException if <tt>aText</tt> is <tt>null</tt>.
  */  
  public static String forScriptTagsOnly(String aText){
    //'$1' re-inserts whatever followed the tag name (attributes, white space), verbatim
    final String startTagsDisabled = SCRIPT.matcher(aText).replaceAll("&lt;SCRIPT$1&gt;");
    return SCRIPT_END.matcher(startTagsDisabled).replaceAll("&lt;/SCRIPT$1&gt;");
  }

  // PRIVATE //

  private EscapeChars(){
    //empty - prevent construction
  }

  /** 
   Start tag : the tag name, optionally followed by a separator (white space or slash) 
   and any text up to the next <tt>&gt;</tt>. Group 1 is that optional trailing text. 
  */
  private static final Pattern SCRIPT = Pattern.compile(
    "<SCRIPT((?:[\\s/][^>]*)?)>", Pattern.CASE_INSENSITIVE
  );

  /** End tag : same shape as {@link #SCRIPT}, with a leading slash. */
  private static final Pattern SCRIPT_END = Pattern.compile(
    "</SCRIPT((?:[\\s/][^>]*)?)>", Pattern.CASE_INSENSITIVE
  );

  /** Characters which {@link #forRegex(String)} prefixes with a backslash. */
  private static final String REGEX_SPECIAL_CHARS = ".\\?*+&:{}[]()^$|";

  /**
   Return <tt>true</tt> only if {@link #forHTML(String)} replaces <tt>aChar</tt> with 
   a numeric character entity : the tab character, and the ASCII punctuation ranges.
   
   <P>The four characters having named entities fall inside these ranges, but the 
   caller handles them before consulting this method.
  */
  private static boolean needsNumericEntity(char aChar){
    return aChar == '\t'
      || (aChar >= '!' && aChar <= '/')
      || (aChar >= ':' && aChar <= '@')
      || (aChar >= '[' && aChar <= '`')
      || (aChar >= '{' && aChar <= '~')
    ;
  }

  /**
   Append a decimal character entity for <tt>aCode</tt> to <tt>aBuilder</tt>, 
   left-padded with zeros to at least three digits (for example <tt>&amp;#009;</tt>).
  */
  private static void addCharEntity(int aCode, StringBuilder aBuilder){
    aBuilder.append("&#");
    if (aCode < 100) {
      aBuilder.append('0');
    }
    if (aCode < 10) {
      aBuilder.append('0');
    }
    aBuilder.append(aCode).append(';');
  }

  /**
   Append the six-character JSON unicode escape for <tt>aChar</tt> to 
   <tt>aBuilder</tt> : a backslash, the letter u, and four lower case hex digits.
  */
  private static void addUnicodeEscape(char aChar, StringBuilder aBuilder){
    final String hex = Integer.toHexString(aChar);
    aBuilder.append("\\u");
    for (int pad = hex.length(); pad < 4; ++pad) {
      aBuilder.append('0');
    }
    aBuilder.append(hex);
  }
}