annotate src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 9:ccb40d1cb213

[no commit message]
author ferrall@nwoca.org
date Fri, 28 Jan 2011 09:07:05 -0500
parents e8ea26ab2cd7
children 2fb5084b1564
rev   line source
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
1 package org.nwoca.ssdt.tools.html2wiki;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
2 /*
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
3 * Html2Wiki.java
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
4 *
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
5 * Created on May 9, 2006, 3:22 PM
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
6 *
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
7 */
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
8
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
9 import java.io.*;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
10 import java.util.Collection;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
11 import java.util.ArrayList;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
12 import java.util.List;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
13 import org.apache.commons.io.FileUtils;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
14 import java.util.regex.*;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
15
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
16 /**
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
17 * Converter to convert HTML documents into MediaWiki text.
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
18 *
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
19 * Heavily customized to handle HTML produced by DEC DOCUMENT
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
20 * SOFTWARE doctype. Breaks file into Chapters in the manner done
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
21 * by Document. Needs modification to work with other HTML files.
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
22 *
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
23 * @author SMITH
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
24 */
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
25 public class Html2Wiki {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
26
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
27 private StringBuffer buffer;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
28 private Collection<Transformer> transformers;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
29 private boolean converted = false;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
30 private static String category;
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
31
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
32 /** Creates a new instance of Html2Wiki. */
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
33 public Html2Wiki(String html) {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
34 buffer = new StringBuffer(html);
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
35 transformers = new ArrayList<Transformer>();
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
36 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
37 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
38 transformers.add(new DeleteTransformer("<a .*?>|</a>"));
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
39 transformers.add(new DeleteTransformer("(?m)^\\*"));
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
40 transformers.add(new DeleteTransformer("(?m)<br>$"));
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
41 transformers.add(new DeleteTransformer("<font .*?>|</font>"));
4
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
42 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>"));
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
43 transformers.add(new BadTableDataTransformer());
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
44 transformers.add(new BadTableRowTransformer());
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
45 transformers.add(new ReflowTransformer());
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
46 transformers.add(new DeleteTransformer("<p>"));
8
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
47 transformers.add(new ReplaceTransformer("\\{","\\{")); // Escape braces
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
48 transformers.add(new ReplaceTransformer("\\}","\\}"));
7
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
49
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
50 transformers.add(new ReplaceTransformer("\\[","\\[")); // Escape brackets
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
51 transformers.add(new ReplaceTransformer("\\]","\\]"));
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
52 transformers.add(new PreTagTransformer()); // Unescape brackets inside <pre>
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
53 //
4
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
54 transformers.add(new ReplaceTransformer("<br>","\\\\"));
8
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
55
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
56 //replace table tag preserving border setting.
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
57 transformers.add(new TagTransformer("<table\\sborder=(\\d).*?>", true, "{table:border=","}"));
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
58
4
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
59 transformers.add(new ReplaceTransformer("<table.*?>|</table>","{table}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
60 transformers.add(new ReplaceTransformer("<tr>|</tr>","{tr}"));
5
d34f4d408ef9 [no commit message]
ferrall@nwoca.org
parents: 4
diff changeset
61 transformers.add(new ReplaceTransformer("<td.*?>|</td>","{td}"));
d34f4d408ef9 [no commit message]
ferrall@nwoca.org
parents: 4
diff changeset
62 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}"));
4
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
63 transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
64 transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
65 transformers.add(new ReplaceTransformer("<li>","{li}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
66 transformers.add(new ReplaceTransformer("</li>","{li}\n"));
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
67
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
68 transformers.add(new ChapterTransformer(category));
4
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
69 transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
70 transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
71 transformers.add(new TagTransformer("<em>(.*?)</em>", "*","*"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
72 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "*","*"));
9
ccb40d1cb213 [no commit message]
ferrall@nwoca.org
parents: 8
diff changeset
73 transformers.add(new TagTransformer("<u>(.*?)</u>" , "+","+"));
4
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
74 transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>", "{{", "}}"));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
75 transformers.add(new TagTransformer("<h1>(.*)</h1>", "h1. ", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
76 transformers.add(new TagTransformer("<h2>(.*)</h2>", "h2. ", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
77 transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","h3.", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
78 transformers.add(new TagTransformer("<h3>(.*)</H3>", "h3. ", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
79 transformers.add(new TagTransformer("<h3>(.*)</h3>", "h3. ", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
80 transformers.add(new TagTransformer("<h4>(.*)</h4>", "h4. ", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
81 transformers.add(new TagTransformer("<h5>(.*)</h5>", "h5. ", ""));
22ed6d93442c Start modifying transformers to Confluence wiki syntax
smith@nwoca.org
parents: 2
diff changeset
82 transformers.add(new TagTransformer("<h6>(.*)</h6>", "h6. ", ""));
8
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
83
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
84 //Replace Notes with Info tags.
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
85 transformers.add(new ReplaceTransformer("\\{center}\\n\\{table:border=\\d}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}","{info}"));
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
86 transformers.add(new ReplaceTransformer("\\{td\\}\\n\\s{2}\\{tr\\}\\n\\{table\\}\\n\\{center\\}","{info}"));
5
d34f4d408ef9 [no commit message]
ferrall@nwoca.org
parents: 4
diff changeset
87
8
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
88 //Remove unnecessary table surrounding code blocks.
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
89 transformers.add(new ReplaceTransformer("\\{table:.*\\}\\n\\s{2}\\{tr\\}\\n\\s{4}\\{td\\}\\n\\s{6}\\n\\{code\\}","{code}"));
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
90 transformers.add(new ReplaceTransformer("\\{code\\}\\n\\{td\\}\\{tr\\}\\{table\\}","{code}"));
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
91
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
92 //Change borderStyle of code window for "screenshots" to none.
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
93 transformers.add(new TagTransformer("\\{code\\}([\\s\\n]*?_______________)", true, "{code:borderStyle=none}", ""));
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
94
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
95
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
96
7
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
97 transformers.add(new TagTransformer("<blockquote>(.*?)</blockquote>", true, "{quote}", "{quote}"));
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
98 transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
8
e8ea26ab2cd7 [no commit message]
ferrall@nwoca.org
parents: 7
diff changeset
99 transformers.add(new ReflowTransformer("(\\{info\\})([^\\{]*)(\\{info\\})"));
7
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
100 transformers.add(new TagTransformer("<sup>(.*?)</sup>", true, "^\\[","\\]^ "));
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
101 transformers.add(new ReplaceTransformer("&lt;","<"));
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
102 transformers.add(new ReplaceTransformer("&gt;",">"));
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
103 transformers.add(new ReplaceTransformer("&quot;","\""));
a634b4d554d4 Minor fixups &gt;, random smilies :), etc. Fixed blockquote. Handle escaping brackets outside pre tag.
smith@nwoca.org
parents: 6
diff changeset
104 transformers.add(new ReplaceTransformer(":\\)",": )")); // No smilies...
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
105
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
106 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
107
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
108 /**
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
109 * @param args the command line arguments
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
110 */
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
111 public static void main(String[] args) throws IOException {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
112
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
113 if (args.length == 0) {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
114 System.out.println("Usage:");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
115 System.out.println(" Html2Wiki {inputDirectory} [Category]");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
116 System.out.println(" default is current directory");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
117 System.out.println(" Processes all *.html files. ");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
118 System.out.println(" Each 'chapter' written to *.wiki");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
119 return;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
120 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
121
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
122 File inputs = new File(args[0]);
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
123
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
124 if (args.length > 1) {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
125 category = args[1];
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
126 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
127
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
128 File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
129 for (int i = 0; i < inputFiles.length; i++) {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
130
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
131 process(inputFiles[i]);
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
132
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
133 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
134
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
135 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
136
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
137 protected static void process(File input) throws IOException {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
138
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
139 System.out.println(input.getAbsoluteFile());
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
140
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
141 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null));
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
142
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
143 WikiChapter[] chapters = converter.getWikiChapters();
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
144
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
145 System.out.format("Writing %d wiki files...\n", chapters.length);
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
146
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
147
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
148 for (int i = 0; i < chapters.length; i++) {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
149
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
150 FileUtils.writeStringToFile(new File(input.getParent(),
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
151 generateFilename(chapters[i].getChapterName()) + ".wiki"),
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
152 chapters[i].getContents().toString(),
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
153 null);
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
154
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
155 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
156
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
157 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
158
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
159 public static String generateFilename(String input) {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
160 return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("<br>", "");
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
161
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
162 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
163
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
164 public String getWikiText() {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
165 convert();
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
166 return buffer.toString();
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
167 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
168
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
169 public WikiChapter[] getWikiChapters() {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
170
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
171 convert();
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
172
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
173 List<WikiChapter> chapters = new ArrayList<WikiChapter>();
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
174
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
175 Pattern chapterPat = Pattern.compile("<chapter>");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
176 Matcher begin = chapterPat.matcher(buffer);
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
177 Matcher end = chapterPat.matcher(buffer);
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
178
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
179 while (begin.find()) {
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
180
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
181
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
182 end.find(begin.end());
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
183
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
184 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>");
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
185
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
186 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer);
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
187
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
188 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null;
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
189
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
190 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? buffer.length() : end.start());
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
191
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
192 chapters.add(new WikiChapter(chapterName, contents));
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
193
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
194 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
195 return (WikiChapter[]) chapters.toArray(new WikiChapter[]{});
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
196 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
197
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
198 private void convert() {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
199
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
200 if (!converted) {
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
201 for (Transformer t : transformers) {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
202
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
203 System.out.println(".Applying: " + t);
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
204 t.apply(buffer);
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
205
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
206 }
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
207 }
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
208 converted = true;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
209 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
210
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
211 private static class HtmlFileFilter implements FileFilter {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
212
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
213 public boolean accept(File pathname) {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
214 return pathname.getName().toLowerCase().matches("^.*\\.html$");
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
215 }
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
216 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
217
2
5da2e67620f9 Upgrade to Ivy configuration and begin clean up of tests. Added FreeBSD license.
smith@nwoca.org
parents: 0
diff changeset
218 protected static class WikiChapter {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
219
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
220 private String chapterName;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
221 private CharSequence contents;
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
222
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
223 public WikiChapter(String chapterName, CharSequence contents) {
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
224 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&amp;", "and");
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
225
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
226 this.contents = contents;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
227 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
228
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
229 public String getChapterName() {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
230 return chapterName;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
231 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
232
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
233 public CharSequence getContents() {
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
234 return contents;
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
235 }
6
99f293bd507f Add "reflow" transformer to reflow paragraphs, list items, etc.
smith@nwoca.org
parents: 5
diff changeset
236
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
237 public String toString() {
2
5da2e67620f9 Upgrade to Ivy configuration and begin clean up of tests. Added FreeBSD license.
smith@nwoca.org
parents: 0
diff changeset
238 return "Chapter: " + chapterName + " Content length: " + contents.length();
0
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
239 }
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
240 }
f8b1ea49d065 Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
smith@nwoca.org
parents:
diff changeset
241 }