001    /**
002     * Copyright (c) 2000-2012 Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portlet.wiki.translators;
016    
017    import com.liferay.portal.kernel.util.StringPool;
018    import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;
019    
020    import java.util.regex.Matcher;
021    import java.util.regex.Pattern;
022    
023    /**
024     * @author Jorge Ferrer
025     */
026    public class MediaWikiToCreoleTranslator extends BaseTranslator {
027    
028            public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";
029    
030            public MediaWikiToCreoleTranslator() {
031                    initRegexps();
032                    initNowikiRegexps();
033            }
034    
035            protected void initNowikiRegexps() {
036    
037                    // Preformat protected
038    
039                    nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
040                    nowikiRegexps.add("(<pre>)(.*?)(</pre>)");
041    
042                    // Escape protected
043    
044                    nowikiRegexps.add(
045                            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
046            }
047    
048            protected void initRegexps() {
049    
050                    // Clean unnecessary header emphasis
051    
052                    regexps.put("= '''([^=]+)''' =", "= $1 =");
053                    regexps.put("== '''([^=]+)''' ==", "== $1 ==");
054                    regexps.put("== '''([^=]+)''' ===", "=== $1 ===");
055    
056                    // Unscape angle brackets
057    
058                    regexps.put("&lt;", "<");
059                    regexps.put("&gt;", ">");
060    
061                    // Remove categories
062    
063                    regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");
064    
065                    // Remove disambiguations
066    
067                    regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);
068    
069                    // Remove work in progress
070    
071                    regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);
072    
073                    // Bold and italics
074    
075                    regexps.put(
076                            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");
077    
078                    // Bold
079    
080                    regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");
081    
082                    // Italics
083    
084                    regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");
085    
086                    // Normalize URLs
087    
088                    regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");
089    
090                    // URL
091    
092                    regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");
093    
094                    // URL with label
095    
096                    regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");
097    
098                    // Term and definition
099    
100                    regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");
101    
102                    // Indented paragraph
103    
104                    regexps.put("^\\t:\\t(.*)", "$1");
105    
106                    // Monospace
107    
108                    regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");
109    
110                    // No wiki
111    
112                    regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");
113    
114                    // HTML PRE
115    
116                    regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");
117    
118                    // User reference
119    
120                    regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
121            }
122    
123            @Override
124            protected String postProcess(String content) {
125    
126                    // LEP-6118
127    
128                    Matcher matcher = Pattern.compile(
129                            "^=([^=]+)=", Pattern.MULTILINE).matcher(content);
130    
131                    if (matcher.find()) {
132                            content = runRegexp(content, "^===([^=]+)===", "====$1====");
133                            content = runRegexp(content, "^==([^=]+)==", "===$1===");
134                            content = runRegexp(content, "^=([^=]+)=", "==$1==");
135                    }
136    
137                    // Remove HTML tags
138    
139                    for (int i = 0; i < _HTML_TAGS.length; i++) {
140                            content = content.replaceAll(_HTML_TAGS[i], StringPool.BLANK);
141                    }
142    
143                    // Images
144    
145                    matcher = Pattern.compile(
146                            "\\[{2}Image:([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
147    
148                    StringBuffer sb = new StringBuffer(content);
149    
150                    int offset = 0;
151    
152                    while (matcher.find()) {
153                            String image =
154                                    "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
155                                            matcher.group(1).toLowerCase() + "}}";
156    
157                            sb.replace(
158                                    matcher.start(0) + offset, matcher.end(0) + offset, image);
159    
160                            offset += MediaWikiImporter.SHARED_IMAGES_TITLE.length() - 5;
161                    }
162    
163                    content = sb.toString();
164    
165                    // Remove underscores from links
166    
167                    matcher = Pattern.compile(
168                            "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
169    
170                    sb = new StringBuffer(content);
171    
172                    while (matcher.find()) {
173                            String link = matcher.group(1).replace(
174                                    StringPool.UNDERLINE, StringPool.SPACE);
175    
176                            sb.replace(matcher.start(1), matcher.end(1), link);
177                    }
178    
179                    return TABLE_OF_CONTENTS + super.postProcess(sb.toString());
180            }
181    
182            private static final String[] _HTML_TAGS = {
183                    "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
184                    "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
185                    "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
186                    "</p>", "<tt>", "</tt>", "<var>", "</var>"};
187    
188    }