001    /**
002     * Copyright (c) 2000-2013 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.security.pacl.DoPrivileged;
018    import com.liferay.portal.kernel.util.CharPool;
019    import com.liferay.portal.kernel.util.Html;
020    import com.liferay.portal.kernel.util.HttpUtil;
021    import com.liferay.portal.kernel.util.StringBundler;
022    import com.liferay.portal.kernel.util.StringPool;
023    import com.liferay.portal.kernel.util.StringUtil;
024    import com.liferay.portal.kernel.util.Validator;
025    
026    import java.util.regex.Matcher;
027    import java.util.regex.Pattern;
028    
029    import net.htmlparser.jericho.Renderer;
030    import net.htmlparser.jericho.Source;
031    import net.htmlparser.jericho.TextExtractor;
032    
033    /**
034     * @author Brian Wing Shun Chan
035     * @author Clarence Shen
036     * @author Harry Mark
037     * @author Samuel Kong
038     * @author Connor McKay
039     * @author Shuyang Zhou
040     */
041    @DoPrivileged
042    public class HtmlImpl implements Html {
043    
044            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
045    
046            public static final int ESCAPE_MODE_CSS = 2;
047    
048            public static final int ESCAPE_MODE_JS = 3;
049    
050            public static final int ESCAPE_MODE_TEXT = 4;
051    
052            public static final int ESCAPE_MODE_URL = 5;
053    
054            public String escape(String text) {
055                    if (text == null) {
056                            return null;
057                    }
058    
059                    if (text.length() == 0) {
060                            return StringPool.BLANK;
061                    }
062    
063                    // Escape using XSS recommendations from
064                    // http://www.owasp.org/index.php/Cross_Site_Scripting
065                    // #How_to_Protect_Yourself
066    
067                    StringBundler sb = null;
068    
069                    int lastReplacementIndex = 0;
070    
071                    for (int i = 0; i < text.length(); i++) {
072                            char c = text.charAt(i);
073    
074                            String replacement = null;
075    
076                            switch (c) {
077                                    case '<':
078                                            replacement = "&lt;";
079    
080                                            break;
081    
082                                    case '>':
083                                            replacement = "&gt;";
084    
085                                            break;
086    
087                                    case '&':
088                                            replacement = "&amp;";
089    
090                                            break;
091    
092                                    case '"':
093                                            replacement = "&#034;";
094    
095                                            break;
096    
097                                    case '\'':
098                                            replacement = "&#039;";
099    
100                                            break;
101    
102                                    case '\u00bb': // '�'
103                                            replacement = "&#187;";
104    
105                                            break;
106    
107                                    case '\u2013':
108                                            replacement = "&#x2013;";
109    
110                                            break;
111    
112                                    case '\u2014':
113                                            replacement = "&#x2014;";
114    
115                                            break;
116                            }
117    
118                            if (replacement != null) {
119                                    if (sb == null) {
120                                            sb = new StringBundler();
121                                    }
122    
123                                    if (i > lastReplacementIndex) {
124                                            sb.append(text.substring(lastReplacementIndex, i));
125                                    }
126    
127                                    sb.append(replacement);
128    
129                                    lastReplacementIndex = i + 1;
130                            }
131                    }
132    
133                    if (sb == null) {
134                            return text;
135                    }
136                    else {
137                            if (lastReplacementIndex < text.length()) {
138                                    sb.append(text.substring(lastReplacementIndex));
139                            }
140    
141                            return sb.toString();
142                    }
143            }
144    
145            public String escape(String text, int type) {
146                    if (text == null) {
147                            return null;
148                    }
149    
150                    if (text.length() == 0) {
151                            return StringPool.BLANK;
152                    }
153    
154                    String prefix = StringPool.BLANK;
155                    String postfix = StringPool.BLANK;
156    
157                    if (type == ESCAPE_MODE_ATTRIBUTE) {
158                            prefix = "&#x";
159                            postfix = StringPool.SEMICOLON;
160                    }
161                    else if (type == ESCAPE_MODE_CSS) {
162                            prefix = StringPool.BACK_SLASH;
163                    }
164                    else if (type == ESCAPE_MODE_JS) {
165                            prefix = "\\x";
166                    }
167                    else if (type == ESCAPE_MODE_URL) {
168                            return HttpUtil.encodeURL(text, true);
169                    }
170                    else {
171                            return escape(text);
172                    }
173    
174                    StringBuilder sb = new StringBuilder();
175    
176                    for (int i = 0; i < text.length(); i++) {
177                            char c = text.charAt(i);
178    
179                            if (Character.isLetterOrDigit(c) ||
180                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
181    
182                                    sb.append(c);
183                            }
184                            else {
185                                    sb.append(prefix);
186    
187                                    String hexString = StringUtil.toHexString(c);
188    
189                                    if (hexString.length() == 1) {
190                                            sb.append(StringPool.ASCII_TABLE[48]);
191                                    }
192    
193                                    sb.append(hexString);
194                                    sb.append(postfix);
195                            }
196                    }
197    
198                    if (sb.length() == text.length()) {
199                            return text;
200                    }
201                    else {
202                            return sb.toString();
203                    }
204            }
205    
206            public String escapeAttribute(String attribute) {
207                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
208            }
209    
210            public String escapeCSS(String css) {
211                    return escape(css, ESCAPE_MODE_CSS);
212            }
213    
214            public String escapeHREF(String href) {
215                    if (href == null) {
216                            return null;
217                    }
218    
219                    if (href.length() == 0) {
220                            return StringPool.BLANK;
221                    }
222    
223                    if (href.indexOf(StringPool.COLON) == 10) {
224                            String protocol = href.substring(0, 10).toLowerCase();
225    
226                            if (protocol.equals("javascript")) {
227                                    href = StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
228                            }
229                    }
230    
231                    return escapeAttribute(href);
232            }
233    
234            public String escapeJS(String js) {
235                    return escape(js, ESCAPE_MODE_JS);
236            }
237    
238            public String escapeURL(String url) {
239                    return escape(url, ESCAPE_MODE_URL);
240            }
241    
242            public String escapeXPath(String xPath) {
243                    if (Validator.isNull(xPath)) {
244                            return xPath;
245                    }
246    
247                    StringBuilder sb = new StringBuilder(xPath.length());
248    
249                    for (int i = 0; i < xPath.length(); i++) {
250                            char c = xPath.charAt(i);
251    
252                            boolean hasToken = false;
253    
254                            for (int j = 0; j < _XPATH_TOKENS.length; j++) {
255                                    if (c == _XPATH_TOKENS[j]) {
256                                            hasToken = true;
257    
258                                            break;
259                                    }
260                            }
261    
262                            if (hasToken) {
263                                    sb.append(StringPool.UNDERLINE);
264                            }
265                            else {
266                                    sb.append(c);
267                            }
268                    }
269    
270                    return sb.toString();
271            }
272    
273            public String escapeXPathAttribute(String xPathAttribute) {
274                    boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
275                    boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);
276    
277                    if (hasQuote && hasApostrophe) {
278                            String[] parts = xPathAttribute.split(StringPool.APOSTROPHE);
279    
280                            return "concat('".concat(
281                                    StringUtil.merge(parts, "', \"'\", '")).concat("')");
282                    }
283    
284                    if (hasQuote) {
285                            return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
286                                    StringPool.APOSTROPHE);
287                    }
288    
289                    return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
290            }
291    
292            public String extractText(String html) {
293                    if (html == null) {
294                            return null;
295                    }
296    
297                    Source source = new Source(html);
298    
299                    TextExtractor textExtractor = source.getTextExtractor();
300    
301                    return textExtractor.toString();
302            }
303    
304            public String fromInputSafe(String text) {
305                    return StringUtil.replace(text, "&amp;", "&");
306            }
307    
308            public String render(String html) {
309                    if (html == null) {
310                            return null;
311                    }
312    
313                    Source source = new Source(html);
314    
315                    Renderer renderer = source.getRenderer();
316    
317                    return renderer.toString();
318            }
319    
320            public String replaceMsWordCharacters(String text) {
321                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
322            }
323    
324            public String stripBetween(String text, String tag) {
325                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
326            }
327    
328            public String stripComments(String text) {
329                    return StringUtil.stripBetween(text, "<!--", "-->");
330            }
331    
332            public String stripHtml(String text) {
333                    if (text == null) {
334                            return null;
335                    }
336    
337                    text = stripComments(text);
338    
339                    StringBuilder sb = new StringBuilder(text.length());
340    
341                    int x = 0;
342                    int y = text.indexOf("<");
343    
344                    while (y != -1) {
345                            sb.append(text.substring(x, y));
346                            sb.append(StringPool.SPACE);
347    
348                            // Look for text enclosed by <abc></abc>
349    
350                            if (isTag(_TAG_SCRIPT, text, y + 1)) {
351                                    y = stripTag(_TAG_SCRIPT, text, y);
352                            }
353                            else if (isTag(_TAG_STYLE, text, y + 1)) {
354                                    y = stripTag(_TAG_STYLE, text, y);
355                            }
356    
357                            x = text.indexOf(">", y);
358    
359                            if (x == -1) {
360                                    break;
361                            }
362    
363                            x++;
364    
365                            if (x < y) {
366    
367                                    // <b>Hello</b
368    
369                                    break;
370                            }
371    
372                            y = text.indexOf("<", x);
373                    }
374    
375                    if (y == -1) {
376                            sb.append(text.substring(x));
377                    }
378    
379                    return sb.toString();
380            }
381    
382            public String toInputSafe(String text) {
383                    return StringUtil.replace(
384                            text,
385                            new String[] {"&", "\""},
386                            new String[] {"&amp;", "&quot;"});
387            }
388    
389            public String unescape(String text) {
390                    if (text == null) {
391                            return null;
392                    }
393    
394                    if (text.length() == 0) {
395                            return StringPool.BLANK;
396                    }
397    
398                    // Optimize this
399    
400                    text = StringUtil.replace(text, "&lt;", "<");
401                    text = StringUtil.replace(text, "&gt;", ">");
402                    text = StringUtil.replace(text, "&amp;", "&");
403                    text = StringUtil.replace(text, "&#034;", "\"");
404                    text = StringUtil.replace(text, "&#039;", "'");
405                    text = StringUtil.replace(text, "&#040;", "(");
406                    text = StringUtil.replace(text, "&#041;", ")");
407                    text = StringUtil.replace(text, "&#044;", ",");
408                    text = StringUtil.replace(text, "&#035;", "#");
409                    text = StringUtil.replace(text, "&#037;", "%");
410                    text = StringUtil.replace(text, "&#059;", ";");
411                    text = StringUtil.replace(text, "&#061;", "=");
412                    text = StringUtil.replace(text, "&#043;", "+");
413                    text = StringUtil.replace(text, "&#045;", "-");
414    
415                    return text;
416            }
417    
418            public String unescapeCDATA(String text) {
419                    if (text == null) {
420                            return null;
421                    }
422    
423                    if (text.length() == 0) {
424                            return StringPool.BLANK;
425                    }
426    
427                    text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
428                    text = StringUtil.replace(text, "]]&gt;", "]]>");
429    
430                    return text;
431            }
432    
433            public String wordBreak(String text, int columns) {
434                    StringBundler sb = new StringBundler();
435    
436                    int length = 0;
437                    int lastWrite = 0;
438                    int pos = 0;
439    
440                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
441    
442                    Matcher matcher = pattern.matcher(text);
443    
444                    while (matcher.find()) {
445                            if (matcher.start() < pos) {
446                                    continue;
447                            }
448    
449                            while ((length + matcher.start() - pos) >= columns) {
450                                    pos += columns - length;
451    
452                                    sb.append(text.substring(lastWrite, pos));
453                                    sb.append("<wbr/>&shy;");
454    
455                                    length = 0;
456                                    lastWrite = pos;
457                            }
458    
459                            length += matcher.start() - pos;
460    
461                            String group = matcher.group();
462    
463                            if (group.equals(StringPool.AMPERSAND)) {
464                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
465    
466                                    if (x != -1) {
467                                            length++;
468                                            pos = x + 1;
469                                    }
470    
471                                    continue;
472                            }
473    
474                            if (group.equals(StringPool.LESS_THAN)) {
475                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
476    
477                                    if (x != -1) {
478                                            pos = x + 1;
479                                    }
480    
481                                    continue;
482                            }
483    
484                            if (group.equals(StringPool.SPACE) ||
485                                    group.equals(StringPool.NEW_LINE)) {
486    
487                                    length = 0;
488                                    pos = matcher.start() + 1;
489                            }
490                    }
491    
492                    sb.append(text.substring(lastWrite));
493    
494                    return sb.toString();
495            }
496    
497            protected boolean isTag(char[] tag, String text, int pos) {
498                    if ((pos + tag.length + 1) <= text.length()) {
499                            char item;
500    
501                            for (int i = 0; i < tag.length; i++) {
502                                    item = text.charAt(pos++);
503    
504                                    if (Character.toLowerCase(item) != tag[i]) {
505                                            return false;
506                                    }
507                            }
508    
509                            item = text.charAt(pos);
510    
511                            // Check that char after tag is not a letter (i.e. another tag)
512    
513                            return !Character.isLetter(item);
514                    }
515                    else {
516                            return false;
517                    }
518            }
519    
520            protected int stripTag(char[] tag, String text, int pos) {
521                    int x = pos + _TAG_SCRIPT.length;
522    
523                    // Find end of the tag
524    
525                    x = text.indexOf(">", x);
526    
527                    if (x >= 0) {
528    
529                            // Check if preceding character is / (i.e. is this instance of
530                            // <abc/>)
531    
532                            if (text.charAt(x-1) != '/') {
533    
534                                    // Search for the ending </abc> tag
535    
536                                    for (;;) {
537                                            x = text.indexOf("</", x);
538    
539                                            if (x >= 0) {
540                                                    if (isTag(tag, text, x + 2)) {
541                                                            pos = x;
542    
543                                                            break;
544                                                    }
545                                                    else {
546    
547                                                            // Skip past "</"
548    
549                                                            x += 2;
550                                                    }
551                                            }
552                                            else {
553                                                    break;
554                                            }
555                                    }
556                            }
557                    }
558    
559                    return pos;
560            }
561    
562            private static final String[] _MS_WORD_HTML = new String[] {
563                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
564            };
565    
566            private static final String[] _MS_WORD_UNICODE = new String[] {
567                    "\u00ae", "\u2019", "\u201c", "\u201d"
568            };
569    
570            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
571    
572            private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};
573    
574            // See http://www.w3.org/TR/xpath20/#lexical-structure
575    
576            private static final char[] _XPATH_TOKENS = {
577                    '(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
578                    '<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232};
579    
580    }