001 /** 002 * Copyright (c) 2000-present Liferay, Inc. All rights reserved. 003 * 004 * This library is free software; you can redistribute it and/or modify it under 005 * the terms of the GNU Lesser General Public License as published by the Free 006 * Software Foundation; either version 2.1 of the License, or (at your option) 007 * any later version. 008 * 009 * This library is distributed in the hope that it will be useful, but WITHOUT 010 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 011 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 012 * details. 013 */ 014 015 package com.liferay.portlet.wiki.translators; 016 017 import com.liferay.portal.kernel.util.CharPool; 018 import com.liferay.portal.kernel.util.StringPool; 019 import com.liferay.portal.kernel.util.StringUtil; 020 import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter; 021 022 import java.util.regex.Matcher; 023 import java.util.regex.Pattern; 024 025 /** 026 * @author Jorge Ferrer 027 * @author Daniel Kocsis 028 */ 029 public class MediaWikiToCreoleTranslator extends BaseTranslator { 030 031 public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n"; 032 033 public MediaWikiToCreoleTranslator() { 034 initRegexps(); 035 initNowikiRegexps(); 036 } 037 038 public boolean isStrictImportMode() { 039 return _strictImportMode; 040 } 041 042 public void setStrictImportMode(boolean strictImportMode) { 043 _strictImportMode = strictImportMode; 044 } 045 046 protected void initNowikiRegexps() { 047 048 // Preformat protected 049 050 nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)"); 051 nowikiRegexps.add("(<pre>)(.*?)(</pre>)"); 052 053 // Escape protected 054 055 nowikiRegexps.add( 056 "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)"); 057 } 058 059 protected void initRegexps() { 060 061 // Clean unnecessary header emphasis 062 063 regexps.put("= '''([^=]+)''' =", "= $1 ="); 064 regexps.put("== '''([^=]+)''' ==", "== $1 =="); 065 regexps.put("== '''([^=]+)''' ===", "=== $1 ==="); 066 067 // Unscape angle brackets 068 069 regexps.put("<", "<"); 070 regexps.put(">", ">"); 071 072 // Remove categories 073 074 regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", ""); 075 076 // Remove disambiguations 077 078 regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK); 079 080 // Remove work in progress 081 082 regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK); 083 084 // Remove references 085 086 regexps.put("\\[{2}Wikipedia:([^\\]]*)\\]{2}", StringPool.BLANK); 087 088 // Bold and italics 089 090 regexps.put( 091 "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3"); 092 093 // Bold 094 095 regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3"); 096 097 // Italics 098 099 regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3"); 100 101 // Normalize URLs 102 103 regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]"); 104 105 // URL 106 107 regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]"); 108 109 // URL with label 110 111 regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]"); 112 113 // Term and definition 114 115 regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2"); 116 117 // Indented paragraph 118 119 regexps.put("^\\t:\\t(.*)", "$1"); 120 121 // Monospace 122 123 regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}"); 124 125 // No wiki 126 127 regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}"); 128 129 // HTML PRE 130 131 regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}"); 132 133 // User reference 134 135 regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1"); 136 } 137 138 @Override 139 protected String postProcess(String content) { 140 if (_strictImportMode) { 141 content = runRegexp( 142 content, "\\{{2}Special:(.*?)\\}{2}", StringPool.BLANK); 143 content = runRegexp(content, "\\{{2}(.*?)\\}{2}", StringPool.BLANK); 144 content = runRegexp( 145 content, "(?s)\\{{2}(.*?)\\}{2}", StringPool.BLANK); 146 } 147 else { 148 content = runRegexp( 149 content, "\\{{2}Special:(.*?)\\}{2}", "{{{$1}}}\n"); 150 content = runRegexp(content, "\\{{2}(.*?)\\}{2}", "{{{$1}}}"); 151 content = runRegexp( 152 content, "([^\\{])(\\{{2})([^\\{])", "$1\n{{{\n$3"); 153 content = runRegexp( 154 content, "([^\\}])(\\}{2})([^\\}])", "$1\n}}}\n$3"); 155 } 156 157 // LEP-6118 158 159 Matcher matcher = _titlePattern.matcher(content); 160 161 if (matcher.find()) { 162 content = runRegexp(content, "^===([^=]+)===", "====$1===="); 163 content = runRegexp(content, "^==([^=]+)==", "===$1==="); 164 content = runRegexp(content, "^=([^=]+)=", "==$1=="); 165 } 166 167 // Remove HTML tags 168 169 for (Pattern pattern : _htmlTagPatterns) { 170 matcher = pattern.matcher(content); 171 172 content = matcher.replaceAll(StringPool.BLANK); 173 } 174 175 for (String htmlTag : _HTML_TAGS) { 176 content = StringUtil.replace(content, htmlTag, StringPool.BLANK); 177 } 178 179 // Images 180 181 matcher = _imagePattern.matcher(content); 182 183 StringBuffer sb = new StringBuffer(content); 184 185 int level = 0; 186 int offset = 0; 187 int originalLength = 0; 188 int prefixLength = 0; 189 190 while (matcher.find()) { 191 level = 0; 192 prefixLength = matcher.end(2) - matcher.start(2); 193 194 for (int i = matcher.start(0) + offset; i < sb.length() - 1; i++) { 195 if ((sb.charAt(i) == '[') && (sb.charAt(i + 1) == '[')) { 196 level++; 197 } 198 else if ((sb.charAt(i) == ']') && (sb.charAt(i + 1) == ']')) { 199 level--; 200 201 if (level == 0) { 202 originalLength = (i + 2) - (matcher.start(0) + offset); 203 204 break; 205 } 206 } 207 } 208 209 int imageStartPos = matcher.end(3) + offset; 210 int imageEndPos = matcher.start(2) + offset + originalLength - 4; 211 212 String image = 213 "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" + 214 StringUtil.toLowerCase( 215 sb.substring(imageStartPos, imageEndPos)) + "}}"; 216 217 int imageLength = image.length(); 218 219 image = StringUtil.replace(image, "[[", StringPool.BLANK); 220 image = StringUtil.replace(image, "]]", StringPool.BLANK); 221 222 sb.replace( 223 matcher.start(0) + offset, 224 matcher.start(0) + originalLength + offset, image); 225 226 offset += 227 MediaWikiImporter.SHARED_IMAGES_TITLE.length() - prefixLength - 228 (imageLength - image.length()); 229 } 230 231 content = sb.toString(); 232 233 // Tables 234 235 matcher = _tablePattern.matcher(content); 236 237 sb = new StringBuffer(content); 238 239 String mediaWikiTable = null; 240 241 offset = 0; 242 originalLength = 0; 243 244 while (matcher.find()) { 245 mediaWikiTable = sb.substring( 246 matcher.start(1) + offset, matcher.end(1) + offset); 247 248 originalLength = mediaWikiTable.length() + 4; 249 250 Matcher matcher1 = _mediaWikiTablePattern1.matcher(mediaWikiTable); 251 252 mediaWikiTable = matcher1.replaceAll(StringPool.BLANK); 253 254 Matcher matcher2 = _mediaWikiTablePattern2.matcher(mediaWikiTable); 255 256 mediaWikiTable = matcher2.replaceAll("$1"); 257 258 Matcher matcher3 = _mediaWikiTablePattern3.matcher(mediaWikiTable); 259 260 mediaWikiTable = matcher3.replaceAll("===$1==="); 261 262 Matcher matcher4 = _mediaWikiTablePattern4.matcher(mediaWikiTable); 263 264 mediaWikiTable = matcher4.replaceAll("|=$1|"); 265 266 mediaWikiTable = StringUtil.replace( 267 mediaWikiTable, CharPool.NEW_LINE, StringPool.BLANK); 268 mediaWikiTable = StringUtil.replace( 269 mediaWikiTable, CharPool.RETURN, StringPool.BLANK); 270 mediaWikiTable = StringUtil.replace(mediaWikiTable, "|-", "\n\r"); 271 mediaWikiTable = StringUtil.replace(mediaWikiTable, "||", "|"); 272 mediaWikiTable = StringUtil.replace( 273 mediaWikiTable, "////", StringPool.BLANK); 274 275 sb.replace( 276 matcher.start(0) + offset, 277 matcher.start(0) + originalLength + offset, mediaWikiTable); 278 279 offset += mediaWikiTable.length() - originalLength; 280 } 281 282 content = sb.toString(); 283 284 content = runRegexp(content, "/{2}(\\{{3})", "$1"); 285 content = runRegexp(content, "(\\}{3})/{2}", "$1"); 286 287 // Remove underscores from links 288 289 matcher = _linkPattern.matcher(content); 290 291 sb = new StringBuffer(content); 292 293 while (matcher.find()) { 294 String link = matcher.group(1).replace( 295 StringPool.UNDERLINE, StringPool.SPACE); 296 297 sb.replace(matcher.start(1), matcher.end(1), link); 298 } 299 300 return TABLE_OF_CONTENTS + super.postProcess(sb.toString()); 301 } 302 303 private static final String[] _HTML_TAGS = { 304 "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />", "<center>", 305 "</center>", "<cite>", "</cite>","<code>", "</code>", "</div>", 306 "</font>", "<hr>", "<hr/>", "<hr />", "<p>", "</p>", "<tt>", "</tt>", 307 "<var>", "</var>" 308 }; 309 310 private Pattern[] _htmlTagPatterns = { 311 Pattern.compile("<div[^>]*>"), Pattern.compile("<font[^>]*>")}; 312 private Pattern _imagePattern = Pattern.compile( 313 "(\\[{2})(Image|File)(:)", Pattern.DOTALL); 314 private Pattern _linkPattern = Pattern.compile( 315 "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL); 316 private Pattern _mediaWikiTablePattern1 = Pattern.compile( 317 "class=(.*?)[|\n\r]"); 318 private Pattern _mediaWikiTablePattern2 = Pattern.compile("(\\|\\-)(.*)"); 319 private Pattern _mediaWikiTablePattern3 = Pattern.compile("\\|\\+(.*)"); 320 private Pattern _mediaWikiTablePattern4 = Pattern.compile("(?m)^!(.+)"); 321 private boolean _strictImportMode; 322 private Pattern _tablePattern = Pattern.compile( 323 "\\{\\|(.*?)\\|\\}", Pattern.DOTALL); 324 private Pattern _titlePattern = Pattern.compile( 325 "^=([^=]+)=", Pattern.MULTILINE); 326 327 }