001    /**
002     * Copyright (c) 2000-present Liferay, Inc. All rights reserved.
003     *
004     * This library is free software; you can redistribute it and/or modify it under
005     * the terms of the GNU Lesser General Public License as published by the Free
006     * Software Foundation; either version 2.1 of the License, or (at your option)
007     * any later version.
008     *
009     * This library is distributed in the hope that it will be useful, but WITHOUT
010     * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
011     * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
012     * details.
013     */
014    
015    package com.liferay.portlet.wiki.translators;
016    
017    import com.liferay.portal.kernel.util.CharPool;
018    import com.liferay.portal.kernel.util.StringPool;
019    import com.liferay.portal.kernel.util.StringUtil;
020    import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;
021    
022    import java.util.regex.Matcher;
023    import java.util.regex.Pattern;
024    
025    /**
026     * @author Jorge Ferrer
027     * @author Daniel Kocsis
028     */
029    public class MediaWikiToCreoleTranslator extends BaseTranslator {
030    
031            public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";
032    
033            public MediaWikiToCreoleTranslator() {
034                    initRegexps();
035                    initNowikiRegexps();
036            }
037    
038            public boolean isStrictImportMode() {
039                    return _strictImportMode;
040            }
041    
042            public void setStrictImportMode(boolean strictImportMode) {
043                    _strictImportMode = strictImportMode;
044            }
045    
046            protected void initNowikiRegexps() {
047    
048                    // Preformat protected
049    
050                    nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
051                    nowikiRegexps.add("(<pre>)(.*?)(</pre>)");
052    
053                    // Escape protected
054    
055                    nowikiRegexps.add(
056                            "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
057            }
058    
059            protected void initRegexps() {
060    
061                    // Clean unnecessary header emphasis
062    
063                    regexps.put("= '''([^=]+)''' =", "= $1 =");
064                    regexps.put("== '''([^=]+)''' ==", "== $1 ==");
065                    regexps.put("== '''([^=]+)''' ===", "=== $1 ===");
066    
067                    // Unscape angle brackets
068    
069                    regexps.put("&lt;", "<");
070                    regexps.put("&gt;", ">");
071    
072                    // Remove categories
073    
074                    regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");
075    
076                    // Remove disambiguations
077    
078                    regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);
079    
080                    // Remove work in progress
081    
082                    regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);
083    
084                    // Remove references
085    
086                    regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK);
087    
088                    // Bold and italics
089    
090                    regexps.put(
091                            "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");
092    
093                    // Bold
094    
095                    regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");
096    
097                    // Italics
098    
099                    regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");
100    
101                    // Normalize URLs
102    
103                    regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");
104    
105                    // URL
106    
107                    regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");
108    
109                    // URL with label
110    
111                    regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");
112    
113                    // Term and definition
114    
115                    regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");
116    
117                    // Indented paragraph
118    
119                    regexps.put("^\\t:\\t(.*)", "$1");
120    
121                    // Monospace
122    
123                    regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");
124    
125                    // No wiki
126    
127                    regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");
128    
129                    // HTML PRE
130    
131                    regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");
132    
133                    // User reference
134    
135                    regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
136            }
137    
138            @Override
139            protected String postProcess(String content) {
140                    if (_strictImportMode) {
141                            content = runRegexp(
142                                    content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK);
143                            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK);
144                            content = runRegexp(
145                                    content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK);
146                    }
147                    else {
148                            content = runRegexp(
149                                    content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n");
150                            content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}");
151                            content = runRegexp(
152                                    content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3");
153                            content = runRegexp(
154                                    content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3");
155                    }
156    
157                    // LEP-6118
158    
159                    Matcher matcher = _titlePattern.matcher(content);
160    
161                    if (matcher.find()) {
162                            content = runRegexp(content, "^===([^=]+)===", "====$1====");
163                            content = runRegexp(content, "^==([^=]+)==", "===$1===");
164                            content = runRegexp(content, "^=([^=]+)=", "==$1==");
165                    }
166    
167                    // Remove HTML tags
168    
169                    for (Pattern pattern : _htmlTagPatterns) {
170                            matcher = pattern.matcher(content);
171    
172                            content = matcher.replaceAll(StringPool.BLANK);
173                    }
174    
175                    for (String htmlTag : _HTML_TAGS) {
176                            content = StringUtil.replace(content, htmlTag, StringPool.BLANK);
177                    }
178    
179                    // Images
180    
181                    matcher = _imagePattern.matcher(content);
182    
183                    StringBuffer sb = new StringBuffer(content);
184    
185                    int level = 0;
186                    int offset = 0;
187                    int originalLength = 0;
188                    int prefixLength = 0;
189    
190                    while (matcher.find()) {
191                            level = 0;
192                            prefixLength = matcher.end(2) - matcher.start(2);
193    
194                            for (int i = matcher.start(0) + offset; i < sb.length() - 1; i++) {
195                                    if ((sb.charAt(i) == '[') && (sb.charAt(i + 1) == '[')) {
196                                            level++;
197                                    }
198                                    else if ((sb.charAt(i) == ']') && (sb.charAt(i + 1) == ']')) {
199                                            level--;
200    
201                                            if (level == 0) {
202                                                    originalLength = (i + 2) - (matcher.start(0) + offset);
203    
204                                                    break;
205                                            }
206                                    }
207                            }
208    
209                            int imageStartPos = matcher.end(3) + offset;
210                            int imageEndPos = matcher.start(2) + offset + originalLength - 4;
211    
212                            String image =
213                                    "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
214                                            StringUtil.toLowerCase(
215                                                    sb.substring(imageStartPos, imageEndPos)) + "}}";
216    
217                            int imageLength = image.length();
218    
219                            image = StringUtil.replace(image, "[[", StringPool.BLANK);
220                            image = StringUtil.replace(image, "]]", StringPool.BLANK);
221    
222                            sb.replace(
223                                    matcher.start(0) + offset,
224                                    matcher.start(0) + originalLength + offset, image);
225    
226                            offset +=
227                                    MediaWikiImporter.SHARED_IMAGES_TITLE.length() - prefixLength -
228                                            (imageLength - image.length());
229                    }
230    
231                    content = sb.toString();
232    
233                    // Tables
234    
235                    matcher = _tablePattern.matcher(content);
236    
237                    sb = new StringBuffer(content);
238    
239                    String mediaWikiTable = null;
240    
241                    offset = 0;
242                    originalLength = 0;
243    
244                    while (matcher.find()) {
245                            mediaWikiTable = sb.substring(
246                                    matcher.start(1) + offset, matcher.end(1) + offset);
247    
248                            originalLength = mediaWikiTable.length() + 4;
249    
250                            Matcher matcher1 = _mediaWikiTablePattern1.matcher(mediaWikiTable);
251    
252                            mediaWikiTable = matcher1.replaceAll(StringPool.BLANK);
253    
254                            Matcher matcher2 = _mediaWikiTablePattern2.matcher(mediaWikiTable);
255    
256                            mediaWikiTable = matcher2.replaceAll("$1");
257    
258                            Matcher matcher3 = _mediaWikiTablePattern3.matcher(mediaWikiTable);
259    
260                            mediaWikiTable = matcher3.replaceAll("===$1===");
261    
262                            Matcher matcher4 = _mediaWikiTablePattern4.matcher(mediaWikiTable);
263    
264                            mediaWikiTable = matcher4.replaceAll("|=$1|");
265    
266                            mediaWikiTable = StringUtil.replace(
267                                    mediaWikiTable, CharPool.NEW_LINE, StringPool.BLANK);
268                            mediaWikiTable = StringUtil.replace(
269                                    mediaWikiTable, CharPool.RETURN, StringPool.BLANK);
270                            mediaWikiTable = StringUtil.replace(mediaWikiTable, "|-", "\n\r");
271                            mediaWikiTable = StringUtil.replace(mediaWikiTable, "||", "|");
272                            mediaWikiTable = StringUtil.replace(
273                                    mediaWikiTable, "////", StringPool.BLANK);
274    
275                            sb.replace(
276                                    matcher.start(0) + offset,
277                                    matcher.start(0) + originalLength + offset, mediaWikiTable);
278    
279                            offset += mediaWikiTable.length() - originalLength;
280                    }
281    
282                    content = sb.toString();
283    
284                    content = runRegexp(content, "/{2}(\\{{3})", "$1");
285                    content = runRegexp(content, "(\\}{3})/{2}", "$1");
286    
287                    // Remove underscores from links
288    
289                    matcher = _linkPattern.matcher(content);
290    
291                    sb = new StringBuffer(content);
292    
293                    while (matcher.find()) {
294                            String link = matcher.group(1).replace(
295                                    StringPool.UNDERLINE, StringPool.SPACE);
296    
297                            sb.replace(matcher.start(1), matcher.end(1), link);
298                    }
299    
300                    return TABLE_OF_CONTENTS + super.postProcess(sb.toString());
301            }
302    
303            private static final String[] _HTML_TAGS = {
304                    "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>",
305                    "</center>", "<cite>", "</cite>","<code>", "</code>", "</div>",
306                    "</font>", "<hr>", "<hr/>", "<hr />", "<p>", "</p>", "<tt>", "</tt>",
307                    "<var>", "</var>"
308            };
309    
310            private Pattern[] _htmlTagPatterns = {
311                    Pattern.compile("<div[^>]*>"), Pattern.compile("<font[^>]*>")};
312            private Pattern _imagePattern = Pattern.compile(
313                    "(\\[{2})(Image|File)(:)", Pattern.DOTALL);
314            private Pattern _linkPattern = Pattern.compile(
315                    "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL);
316            private Pattern _mediaWikiTablePattern1 = Pattern.compile(
317                    "class=(.*?)[|\n\r]");
318            private Pattern _mediaWikiTablePattern2 = Pattern.compile("(\\|\\-)(.*)");
319            private Pattern _mediaWikiTablePattern3 = Pattern.compile("\\|\\+(.*)");
320            private Pattern _mediaWikiTablePattern4 = Pattern.compile("(?m)^!(.+)");
321            private boolean _strictImportMode;
322            private Pattern _tablePattern = Pattern.compile(
323                    "\\{\\|(.*?)\\|\\}", Pattern.DOTALL);
324            private Pattern _titlePattern = Pattern.compile(
325                    "^=([^=]+)=", Pattern.MULTILINE);
326    
327    }