comparison src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 0:f8b1ea49d065

Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
author smith@nwoca.org
date Fri, 12 May 2006 16:45:42 -0400
parents
children 5da2e67620f9
comparison
equal deleted inserted replaced
-1:000000000000 0:f8b1ea49d065
1 package org.nwoca.ssdt.tools.html2wiki;
2 /*
3 * Html2Wiki.java
4 *
5 * Created on May 9, 2006, 3:22 PM
6 *
7 */
8
9 import java.io.*;
10 import java.util.Collection;
11 import java.util.ArrayList;
12 import java.util.List;
13 import java.util.Iterator;
14 import org.apache.commons.io.FileSystemUtils;
15 import org.apache.commons.io.FileUtils;
16 import java.util.regex.*;
17 import org.apache.commons.io.FilenameUtils;
18
19 /**
20 * Converter to convert HTML documents into MediaWiki test.
21 *
22 * Heavily customized to handle HTML produced by DEC DOCUMENT
23 * SOFTARE doctype. Breaks file into Chapters in the manner done
24 * by Document. Needs modification to work with other HTML files.
25 *
26 * @author SMITH
27 */
28 public class Html2Wiki {
29
30 private StringBuffer buffer;
31 private Collection<Transformer> transformers;
32 private boolean converted = false;
33 private static String category;
34
35 /** Creates a new instance of Html2Wiki. */
36 public Html2Wiki(String html) {
37 buffer = new StringBuffer(html);
38 transformers = new ArrayList<Transformer>();
39 transformers.add(new PreTagTransformer());
40 transformers.add(new DeleteTransformer("^\\s",true));
41 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
42 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
43 transformers.add(new DeleteTransformer("<a .*?>|</a>"));
44 transformers.add(new DeleteTransformer("(?m)^\\*"));
45 transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
46 transformers.add(new DeleteTransformer("<p>"));
47 transformers.add(new DeleteTransformer("(?m)<br>$"));
48 transformers.add(new DeleteTransformer("<font .*?>|</font>"));
49 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","\n</li>"));
50 transformers.add(new BadTableDataTransformer());
51 transformers.add(new BadTableRowTransformer());
52 transformers.add(new ReplaceTransformer("</td>","\n</td>"));
53 transformers.add(new ChapterTransformer(category));
54 transformers.add(new TagTransformer("<em>(.*?)</em>", "''"));
55 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "'''"));
56 transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>", "<tt>", "</tt>"));
57 transformers.add(new TagTransformer("<h1>(.*)</h1>", "== ", " =="));
58 transformers.add(new TagTransformer("<h2>(.*)</h2>", "=== ", " ==="));
59 transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","=== ", " ==="));
60 transformers.add(new TagTransformer("<h3>(.*)</H3>", "", ""));
61 transformers.add(new TagTransformer("<h3>(.*)</h3>", "==== ", " ===="));
62 transformers.add(new TagTransformer("<h4>(.*)</h4>", "===== ", " ====="));
63 transformers.add(new TagTransformer("<h5>(.*)</h5>", "====== ", " ======"));
64 transformers.add(new TagTransformer("<h6>(.*)</h6>", "======= ", " ======="));
65 transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
66
67 }
68
69 /**
70 * @param args the command line arguments
71 */
72 public static void main(String[] args) throws IOException {
73
74 if (args.length == 0) {
75 System.out.println("Usage:");
76 System.out.println(" Html2Wiki {inputDirectory} [Category]");
77 System.out.println(" default is current directory");
78 System.out.println(" Processes all *.html files. ");
79 System.out.println(" Each 'chapter' written to *.wiki");
80 return;
81 }
82
83 File inputs = new File(args[0]);
84
85 if (args.length > 1) {
86 category = args[1];
87 }
88
89 File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
90 for (int i = 0; i < inputFiles.length; i++) {
91
92 process(inputFiles[i]);
93
94 }
95
96 }
97
98 protected static void process(File input) throws IOException {
99
100 System.out.println(input.getAbsoluteFile());
101
102 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null));
103
104
105 WikiChapter[] chapters = converter.getWikiChapters();
106
107 System.out.format("Writing %d wiki files...\n",chapters.length);
108
109 StringBuffer wikiIndex = new StringBuffer();
110 wikiIndex.append("Contents:\n\n");
111
112 for (int i = 0; i < chapters.length; i++) {
113
114 wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n");
115 FileUtils.writeStringToFile(new File(input.getParent(),
116 generateFilename(chapters[i].getChapterName())+".wiki"),
117 chapters[i].getContents().toString(),
118 null);
119
120 }
121 System.out.println("Writing wikiIndex...");
122
123 FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null);
124 }
125
126 public static String generateFilename(String input) {
127 return input.replaceAll("\\\\|/|:|\\(|\\)","-");
128
129 }
130 public String getWikiText() {
131 convert();
132 return buffer.toString();
133 }
134
135 public WikiChapter[] getWikiChapters() {
136
137 convert();
138
139 List<WikiChapter> chapters = new ArrayList<WikiChapter>();
140
141 Pattern chapterPat = Pattern.compile("<chapter>");
142 Matcher begin = chapterPat.matcher(buffer);
143 Matcher end = chapterPat.matcher(buffer);
144
145 while(begin.find()) {
146
147
148 end.find(begin.end());
149
150 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>");
151
152 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer);
153
154 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null;
155
156 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end()
157 ,end.hitEnd() ? buffer.length() : end.start());
158
159 chapters.add(new WikiChapter(chapterName,contents));
160
161 }
162 return (WikiChapter[])chapters.toArray(new WikiChapter[]{});
163 }
164
165 private void convert() {
166
167 if(!converted) {
168 for (Transformer t : transformers) {
169
170 System.out.println(".Applying: " + t);
171 t.apply(buffer);
172
173 }
174 }
175 converted = true;
176 }
177
178 private static class HtmlFileFilter implements FileFilter {
179 public boolean accept(File pathname) {
180 return pathname.getName().toLowerCase().matches("^.*\\.html$");
181 }
182
183 }
184 private static class WikiChapter {
185 private String chapterName;
186 private CharSequence contents;
187
188 public WikiChapter(String chapterName, CharSequence contents) {
189 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&amp;","and");
190
191 this.contents = contents;
192 }
193
194 public String getChapterName() {
195 return chapterName;
196 }
197
198 public CharSequence getContents() {
199 return contents;
200 }
201
202 public String toString() {
203 return "Chapter: " + chapterName + "\nContents: " + contents;
204 }
205 }
206
207 }