001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
044            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
045    
046            public static final int ESCAPE_MODE_CSS = 2;
047    
048            public static final int ESCAPE_MODE_JS = 3;
049    
050            public static final int ESCAPE_MODE_TEXT = 4;
051    
052            public static final int ESCAPE_MODE_URL = 5;
053    
054            @Override
055            public String escape(String text) {
056                    if (text == null) {
057                            return null;
058                    }
059    
060                    if (text.length() == 0) {
061                            return StringPool.BLANK;
062                    }
063    
064                    // Escape using XSS recommendations from
065                    // http://www.owasp.org/index.php/Cross_Site_Scripting
066                    // #How_to_Protect_Yourself
067    
068                    StringBundler sb = null;
069    
070                    int lastReplacementIndex = 0;
071    
072                    for (int i = 0; i < text.length(); i++) {
073                            char c = text.charAt(i);
074    
075                            String replacement = null;
076    
077                            switch (c) {
078                                    case '<':
079                                            replacement = "&lt;";
080    
081                                            break;
082    
083                                    case '>':
084                                            replacement = "&gt;";
085    
086                                            break;
087    
088                                    case '&':
089                                            replacement = "&amp;";
090    
091                                            break;
092    
093                                    case '"':
094                                            replacement = "&#034;";
095    
096                                            break;
097    
098                                    case '\'':
099                                            replacement = "&#039;";
100    
101                                            break;
102    
103                                    case '\u00bb': // '???'
104                                            replacement = "&#187;";
105    
106                                            break;
107    
108                                    case '\u2013':
109                                            replacement = "&#x2013;";
110    
111                                            break;
112    
113                                    case '\u2014':
114                                            replacement = "&#x2014;";
115    
116                                            break;
117                            }
118    
119                            if (replacement != null) {
120                                    if (sb == null) {
121                                            sb = new StringBundler();
122                                    }
123    
124                                    if (i > lastReplacementIndex) {
125                                            sb.append(text.substring(lastReplacementIndex, i));
126                                    }
127    
128                                    sb.append(replacement);
129    
130                                    lastReplacementIndex = i + 1;
131                            }
132                    }
133    
134                    if (sb == null) {
135                            return text;
136                    }
137    
138                    if (lastReplacementIndex < text.length()) {
139                            sb.append(text.substring(lastReplacementIndex));
140                    }
141    
142                    return sb.toString();
143            }
144    
145            @Override
146            public String escape(String text, int type) {
147                    if (text == null) {
148                            return null;
149                    }
150    
151                    if (text.length() == 0) {
152                            return StringPool.BLANK;
153                    }
154    
155                    String prefix = StringPool.BLANK;
156                    String postfix = StringPool.BLANK;
157    
158                    if (type == ESCAPE_MODE_ATTRIBUTE) {
159                            prefix = "&#x";
160                            postfix = StringPool.SEMICOLON;
161                    }
162                    else if (type == ESCAPE_MODE_CSS) {
163                            prefix = StringPool.BACK_SLASH;
164                    }
165                    else if (type == ESCAPE_MODE_JS) {
166                            prefix = "\\x";
167                    }
168                    else if (type == ESCAPE_MODE_URL) {
169                            return HttpUtil.encodeURL(text, true);
170                    }
171                    else {
172                            return escape(text);
173                    }
174    
175                    StringBuilder sb = new StringBuilder();
176    
177                    for (int i = 0; i < text.length(); i++) {
178                            char c = text.charAt(i);
179    
180                            if ((c > 255) || Character.isLetterOrDigit(c) ||
181                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
182    
183                                    sb.append(c);
184                            }
185                            else {
186                                    sb.append(prefix);
187    
188                                    String hexString = StringUtil.toHexString(c);
189    
190                                    if (hexString.length() == 1) {
191                                            sb.append(StringPool.ASCII_TABLE[48]);
192                                    }
193    
194                                    sb.append(hexString);
195                                    sb.append(postfix);
196                            }
197                    }
198    
199                    if (sb.length() == text.length()) {
200                            return text;
201                    }
202                    else {
203                            return sb.toString();
204                    }
205            }
206    
207            @Override
208            public String escapeAttribute(String attribute) {
209                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
210            }
211    
212            @Override
213            public String escapeCSS(String css) {
214                    return escape(css, ESCAPE_MODE_CSS);
215            }
216    
217            @Override
218            public String escapeHREF(String href) {
219                    if (href == null) {
220                            return null;
221                    }
222    
223                    if (href.length() == 0) {
224                            return StringPool.BLANK;
225                    }
226    
227                    if (href.indexOf(StringPool.COLON) == 10) {
228                            String protocol = StringUtil.toLowerCase(href.substring(0, 10));
229    
230                            if (protocol.equals("javascript")) {
231                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
232                            }
233                    }
234    
235                    return escapeAttribute(href);
236            }
237    
238            @Override
239            public String escapeJS(String js) {
240                    return escape(js, ESCAPE_MODE_JS);
241            }
242    
243            @Override
244            public String escapeURL(String url) {
245                    return escape(url, ESCAPE_MODE_URL);
246            }
247    
248            @Override
249            public String escapeXPath(String xPath) {
250                    if (Validator.isNull(xPath)) {
251                            return xPath;
252                    }
253    
254                    StringBuilder sb = new StringBuilder(xPath.length());
255    
256                    for (int i = 0; i < xPath.length(); i++) {
257                            char c = xPath.charAt(i);
258    
259                            boolean hasToken = false;
260    
261                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
262                                    if (c == _XPATH_TOKENS[j]) {
263                                            hasToken = true;
264    
265                                            break;
266                                    }
267                            }
268    
269                            if (hasToken) {
270                                    sb.append(StringPool.UNDERLINE);
271                            }
272                            else {
273                                    sb.append(c);
274                            }
275                    }
276    
277                    return sb.toString();
278            }
279    
280            @Override
281            public String escapeXPathAttribute(String xPathAttribute) {
282                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
283                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
284    
285                    if (hasQuote && hasApostrophe) {
286                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
287    
288                            return "concat('".concat(
289                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
290                    }
291    
292                    if (hasQuote) {
293                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
294                                    StringPool.APOSTROPHE);
295                    }
296    
297                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
298            }
299    
300            @Override
301            public String extractText(String html) {
302                    if (html == null) {
303                            return null;
304                    }
305    
306                    Source source = new Source(html);
307    
308                    TextExtractor textExtractor = source.getTextExtractor();
309    
310                    return textExtractor.toString();
311            }
312    
313            @Override
314            public String fromInputSafe(String text) {
315                    return StringUtil.replace(text, "&amp;", "&");
316            }
317    
318            @Override
319            public String render(String html) {
320                    if (html == null) {
321                            return null;
322                    }
323    
324                    Source source = new Source(html);
325    
326                    Renderer renderer = source.getRenderer();
327    
328                    return renderer.toString();
329            }
330    
331            @Override
332            public String replaceMsWordCharacters(String text) {
333                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
334            }
335    
336            @Override
337            public String replaceNewLine(String text) {
338                    if (text == null) {
339                            return null;
340                    }
341    
342                    return text.replaceAll("\r?\n", "<br />");
343            }
344    
345            @Override
346            public String stripBetween(String text, String tag) {
347                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
348            }
349    
350            @Override
351            public String stripComments(String text) {
352                    return StringUtil.stripBetween(text, "<!--", "-->");
353            }
354    
355            @Override
356            public String stripHtml(String text) {
357                    if (text == null) {
358                            return null;
359                    }
360    
361                    text = stripComments(text);
362    
363                    StringBuilder sb = new StringBuilder(text.length());
364    
365                    int x = 0;
366                    int y = text.indexOf("<");
367    
368                    while (y != -1) {
369                            sb.append(text.substring(x, y));
370                            sb.append(StringPool.SPACE);
371    
372                            // Look for text enclosed by <abc></abc>
373    
374                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
375                                    y = stripTag(_TAG_SCRIPT, text, y);
376                            }
377                            else if (isTag(_TAG_STYLE, text, y + 1)) {
378                                    y = stripTag(_TAG_STYLE, text, y);
379                            }
380    
381                            x = text.indexOf(">", y);
382    
383                            if (x == -1) {
384                                    break;
385                            }
386    
387                            x++;
388    
389                            if (x < y) {
390    
391                                    // <b>Hello</b
392    
393                                    break;
394                            }
395    
396                            y = text.indexOf("<", x);
397                    }
398    
399                    if (y == -1) {
400                            sb.append(text.substring(x));
401                    }
402    
403                    return sb.toString();
404            }
405    
406            @Override
407            public String toInputSafe(String text) {
408                    return StringUtil.replace(
409                            text,
410                            new String[] {"&", "\""},
411                            new String[] {"&amp;", "&quot;"});
412            }
413    
414            @Override
415            public String unescape(String text) {
416                    if (text == null) {
417                            return null;
418                    }
419    
420                    if (text.length() == 0) {
421                            return StringPool.BLANK;
422                    }
423    
424                    // Optimize this
425    
426                    text = StringUtil.replace(text, "&lt;", "<");
427                    text = StringUtil.replace(text, "&gt;", ">");
428                    text = StringUtil.replace(text, "&amp;", "&");
429                    text = StringUtil.replace(text, "&#034;", "\"");
430                    text = StringUtil.replace(text, "&#039;", "'");
431                    text = StringUtil.replace(text, "&#040;", "(");
432                    text = StringUtil.replace(text, "&#041;", ")");
433                    text = StringUtil.replace(text, "&#044;", ",");
434                    text = StringUtil.replace(text, "&#035;", "#");
435                    text = StringUtil.replace(text, "&#037;", "%");
436                    text = StringUtil.replace(text, "&#059;", ";");
437                    text = StringUtil.replace(text, "&#061;", "=");
438                    text = StringUtil.replace(text, "&#043;", "+");
439                    text = StringUtil.replace(text, "&#045;", "-");
440    
441                    return text;
442            }
443    
444            @Override
445            public String unescapeCDATA(String text) {
446                    if (text == null) {
447                            return null;
448                    }
449    
450                    if (text.length() == 0) {
451                            return StringPool.BLANK;
452                    }
453    
454                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
455                    text = StringUtil.replace(text, "]]&gt;", "]]>");
456    
457                    return text;
458            }
459    
460            @Override
461            public String wordBreak(String text, int columns) {
462                    StringBundler sb = new StringBundler();
463    
464                    int length = 0;
465                    int lastWrite = 0;
466                    int pos = 0;
467    
468                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
469    
470                    Matcher matcher = pattern.matcher(text);
471    
472                    while (matcher.find()) {
473                            if (matcher.start() < pos) {
474                                    continue;
475                            }
476    
477                            while ((length + matcher.start() - pos) >= columns) {
478                                    pos += columns - length;
479    
480                                    sb.append(text.substring(lastWrite, pos));
481                                    sb.append("<wbr/>&shy;");
482    
483                                    length = 0;
484                                    lastWrite = pos;
485                            }
486    
487                            length += matcher.start() - pos;
488    
489                            String group = matcher.group();
490    
491                            if (group.equals(StringPool.AMPERSAND)) {
492                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
493    
494                                    if (x != -1) {
495                                            length++;
496                                            pos = x + 1;
497                                    }
498    
499                                    continue;
500                            }
501    
502                            if (group.equals(StringPool.LESS_THAN)) {
503                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
504    
505                                    if (x != -1) {
506                                            pos = x + 1;
507                                    }
508    
509                                    continue;
510                            }
511    
512                            if (group.equals(StringPool.SPACE) ||
513                                    group.equals(StringPool.NEW_LINE)) {
514    
515                                    length = 0;
516                                    pos = matcher.start() + 1;
517                            }
518                    }
519    
520                    sb.append(text.substring(lastWrite));
521    
522                    return sb.toString();
523            }
524    
525            protected boolean isTag(char[] tag, String text, int pos) {
526                    if ((pos + tag.length + 1) <= text.length()) {
527                            char item;
528    
529                            for (int i = 0; i < tag.length; i++) {
530                                    item = text.charAt(pos++);
531    
532                                    if (Character.toLowerCase(item) != tag[i]) {
533                                            return false;
534                                    }
535                            }
536    
537                            item = text.charAt(pos);
538    
539                            // Check that char after tag is not a letter (i.e. another tag)
540    
541                            return !Character.isLetter(item);
542                    }
543                    else {
544                            return false;
545                    }
546            }
547    
548            protected int stripTag(char[] tag, String text, int pos) {
549                    int x = pos + _TAG_SCRIPT.length;
550    
551                    // Find end of the tag
552    
553                    x = text.indexOf(">", x);
554    
555                    if (x < 0) {
556                            return pos;
557                    }
558    
559                    // Check if preceding character is / (i.e. is this instance of <abc/>)
560    
561                    if (text.charAt(x-1) == '/') {
562                            return pos;
563                    }
564    
565                    // Search for the ending </abc> tag
566    
567                    while (true) {
568                            x = text.indexOf("</", x);
569    
570                            if (x >= 0) {
571                                    if (isTag(tag, text, x + 2)) {
572                                            pos = x;
573    
574                                            break;
575                                    }
576                                    else {
577    
578                                            // Skip past "</"
579    
580                                            x += 2;
581                                    }
582                            }
583                            else {
584                                    break;
585                            }
586                    }
587    
588                    return pos;
589            }
590    
591            private static final String[] _MS_WORD_HTML = new String[] {
592                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
593            };
594    
595            private static final String[] _MS_WORD_UNICODE = new String[] {
596                    "\u00ae", "\u2019", "\u201c", "\u201d"
597            };
598    
599            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
600    
601            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
602    
603            // See http://www.w3.org/TR/xpath20/#lexical-structure
604    
605            private static final char[] _XPATH_TOKENS = {
606                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
607                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
608    
609    }