comparison src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 7:a634b4d554d4

Minor fixups: decode &lt;, &gt; and &quot; entities, keep ":)" from being rendered as a smiley, etc. Fixed blockquote handling. Escape brackets everywhere except inside <pre> tags.
author smith@nwoca.org
date Thu, 27 Jan 2011 18:07:28 -0500
parents 99f293bd507f
children e8ea26ab2cd7
comparison
equal deleted inserted replaced
6:99f293bd507f 7:a634b4d554d4
10 import java.util.Collection; 10 import java.util.Collection;
11 import java.util.ArrayList; 11 import java.util.ArrayList;
12 import java.util.List; 12 import java.util.List;
13 import org.apache.commons.io.FileUtils; 13 import org.apache.commons.io.FileUtils;
14 import java.util.regex.*; 14 import java.util.regex.*;
15 import org.apache.commons.io.FilenameUtils;
16 15
17 /** 16 /**
18 * Converter to convert HTML documents into MediaWiki text. 17 * Converter to convert HTML documents into MediaWiki text.
19 * 18 *
20 * Heavily customized to handle HTML produced by DEC DOCUMENT 19 * Heavily customized to handle HTML produced by DEC DOCUMENT
32 31
33 /** Creates a new instance of Html2Wiki. */ 32 /** Creates a new instance of Html2Wiki. */
34 public Html2Wiki(String html) { 33 public Html2Wiki(String html) {
35 buffer = new StringBuffer(html); 34 buffer = new StringBuffer(html);
36 transformers = new ArrayList<Transformer>(); 35 transformers = new ArrayList<Transformer>();
37 // transformers.add(new PreTagTransformer());
38 // transformers.add(new DeleteTransformer("^\\s",true));
39 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>")); 36 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
40 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true)); 37 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
41 transformers.add(new DeleteTransformer("<a .*?>|</a>")); 38 transformers.add(new DeleteTransformer("<a .*?>|</a>"));
42 transformers.add(new DeleteTransformer("(?m)^\\*")); 39 transformers.add(new DeleteTransformer("(?m)^\\*"));
43 // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); 40 // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
47 transformers.add(new BadTableDataTransformer()); 44 transformers.add(new BadTableDataTransformer());
48 transformers.add(new BadTableRowTransformer()); 45 transformers.add(new BadTableRowTransformer());
49 transformers.add(new ReflowTransformer()); 46 transformers.add(new ReflowTransformer());
50 transformers.add(new DeleteTransformer("<p>")); 47 transformers.add(new DeleteTransformer("<p>"));
51 // transformers.add(new ReplaceTransformer("</td>","\n</td>")); 48 // transformers.add(new ReplaceTransformer("</td>","\n</td>"));
52 transformers.add(new ReplaceTransformer("\\{","\\{")); 49 transformers.add(new ReplaceTransformer("\\{","\\{")); // Escape braces
53 transformers.add(new ReplaceTransformer("\\}","\\}")); 50 transformers.add(new ReplaceTransformer("\\}","\\}"));
54 // transformers.add(new ReplaceTransformer("\\[","\\[")); 51
55 // transformers.add(new ReplaceTransformer("\\]","\\]")); 52 transformers.add(new ReplaceTransformer("\\[","\\[")); // Escape brackets
53 transformers.add(new ReplaceTransformer("\\]","\\]"));
54 transformers.add(new PreTagTransformer()); // Unescape brackets inside <pre>
55 //
56 transformers.add(new ReplaceTransformer("<br>","\\\\")); 56 transformers.add(new ReplaceTransformer("<br>","\\\\"));
57 transformers.add(new ReplaceTransformer("<table.*?>|</table>","{table}")); 57 transformers.add(new ReplaceTransformer("<table.*?>|</table>","{table}"));
58 transformers.add(new ReplaceTransformer("<tr>|</tr>","{tr}")); 58 transformers.add(new ReplaceTransformer("<tr>|</tr>","{tr}"));
59 transformers.add(new ReplaceTransformer("<td.*?>|</td>","{td}")); 59 transformers.add(new ReplaceTransformer("<td.*?>|</td>","{td}"));
60 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}")); 60 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}"));
80 transformers.add(new ReplaceTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}","{note}")); 80 transformers.add(new ReplaceTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}","{note}"));
81 transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}","{note}")); 81 transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}","{note}"));
82 82
83 // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}")); 83 // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}"));
84 // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " ")); 84 // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " "));
85 transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}")); 85 transformers.add(new TagTransformer("<blockquote>(.*?)</blockquote>", true, "{quote}", "{quote}"));
86 transformers.add(new DeleteTransformer("(?s)<hr.*?>")); 86 transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
87 transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})")); 87 transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})"));
88 transformers.add(new TagTransformer("<sup>(.*?)</sup>", true, "^\\[","\\]^ "));
89 transformers.add(new ReplaceTransformer("&lt;","<"));
90 transformers.add(new ReplaceTransformer("&gt;",">"));
91 transformers.add(new ReplaceTransformer("&quot;","\""));
92 transformers.add(new ReplaceTransformer(":\\)",": )")); // No smilies...
88 93
89 } 94 }
90 95
91 /** 96 /**
92 * @param args the command line arguments 97 * @param args the command line arguments