Mercurial > public > html2wiki
comparison src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 0:f8b1ea49d065
Initial version of crude HTML to WikiText converter. Customized for converting HTML files from DEC Document into Wiki markup.
author | smith@nwoca.org |
---|---|
date | Fri, 12 May 2006 16:45:42 -0400 |
parents | |
children | 5da2e67620f9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f8b1ea49d065 |
---|---|
1 package org.nwoca.ssdt.tools.html2wiki; | |
2 /* | |
3 * Html2Wiki.java | |
4 * | |
5 * Created on May 9, 2006, 3:22 PM | |
6 * | |
7 */ | |
8 | |
9 import java.io.*; | |
10 import java.util.Collection; | |
11 import java.util.ArrayList; | |
12 import java.util.List; | |
13 import java.util.Iterator; | |
14 import org.apache.commons.io.FileSystemUtils; | |
15 import org.apache.commons.io.FileUtils; | |
16 import java.util.regex.*; | |
17 import org.apache.commons.io.FilenameUtils; | |
18 | |
19 /** | |
20 * Converter to convert HTML documents into MediaWiki text. | |
21 * | |
22 * Heavily customized to handle HTML produced by DEC DOCUMENT | |
23 * SOFTWARE doctype. Breaks file into Chapters in the manner done | |
24 * by Document. Needs modification to work with other HTML files. | |
25 * | |
26 * @author SMITH | |
27 */ | |
28 public class Html2Wiki { | |
29 | |
30 private StringBuffer buffer; | |
31 private Collection<Transformer> transformers; | |
32 private boolean converted = false; | |
33 private static String category; | |
34 | |
35 /** Creates a new instance of Html2Wiki. */ | |
36 public Html2Wiki(String html) { | |
37 buffer = new StringBuffer(html); | |
38 transformers = new ArrayList<Transformer>(); | |
39 transformers.add(new PreTagTransformer()); | |
40 transformers.add(new DeleteTransformer("^\\s",true)); | |
41 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>")); | |
42 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true)); | |
43 transformers.add(new DeleteTransformer("<a .*?>|</a>")); | |
44 transformers.add(new DeleteTransformer("(?m)^\\*")); | |
45 transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); | |
46 transformers.add(new DeleteTransformer("<p>")); | |
47 transformers.add(new DeleteTransformer("(?m)<br>$")); | |
48 transformers.add(new DeleteTransformer("<font .*?>|</font>")); | |
49 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","\n</li>")); | |
50 transformers.add(new BadTableDataTransformer()); | |
51 transformers.add(new BadTableRowTransformer()); | |
52 transformers.add(new ReplaceTransformer("</td>","\n</td>")); | |
53 transformers.add(new ChapterTransformer(category)); | |
54 transformers.add(new TagTransformer("<em>(.*?)</em>", "''")); | |
55 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "'''")); | |
56 transformers.add(new TagTransformer("(?s)<kbd>(.*?)</kbd>", "<tt>", "</tt>")); | |
57 transformers.add(new TagTransformer("<h1>(.*)</h1>", "== ", " ==")); | |
58 transformers.add(new TagTransformer("<h2>(.*)</h2>", "=== ", " ===")); | |
59 transformers.add(new TagTransformer("<h3>(accessing the program|sample run|sample screens?|sample reports?)</[h|H]3>","=== ", " ===")); | |
60 transformers.add(new TagTransformer("<h3>(.*)</H3>", "", "")); | |
61 transformers.add(new TagTransformer("<h3>(.*)</h3>", "==== ", " ====")); | |
62 transformers.add(new TagTransformer("<h4>(.*)</h4>", "===== ", " =====")); | |
63 transformers.add(new TagTransformer("<h5>(.*)</h5>", "====== ", " ======")); | |
64 transformers.add(new TagTransformer("<h6>(.*)</h6>", "======= ", " =======")); | |
65 transformers.add(new DeleteTransformer("(?s)<hr.*?>")); | |
66 | |
67 } | |
68 | |
69 /** | |
70 * @param args the command line arguments | |
71 */ | |
72 public static void main(String[] args) throws IOException { | |
73 | |
74 if (args.length == 0) { | |
75 System.out.println("Usage:"); | |
76 System.out.println(" Html2Wiki {inputDirectory} [Category]"); | |
77 System.out.println(" default is current directory"); | |
78 System.out.println(" Processes all *.html files. "); | |
79 System.out.println(" Each 'chapter' written to *.wiki"); | |
80 return; | |
81 } | |
82 | |
83 File inputs = new File(args[0]); | |
84 | |
85 if (args.length > 1) { | |
86 category = args[1]; | |
87 } | |
88 | |
89 File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); | |
90 for (int i = 0; i < inputFiles.length; i++) { | |
91 | |
92 process(inputFiles[i]); | |
93 | |
94 } | |
95 | |
96 } | |
97 | |
98 protected static void process(File input) throws IOException { | |
99 | |
100 System.out.println(input.getAbsoluteFile()); | |
101 | |
102 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null)); | |
103 | |
104 | |
105 WikiChapter[] chapters = converter.getWikiChapters(); | |
106 | |
107 System.out.format("Writing %d wiki files...\n",chapters.length); | |
108 | |
109 StringBuffer wikiIndex = new StringBuffer(); | |
110 wikiIndex.append("Contents:\n\n"); | |
111 | |
112 for (int i = 0; i < chapters.length; i++) { | |
113 | |
114 wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n"); | |
115 FileUtils.writeStringToFile(new File(input.getParent(), | |
116 generateFilename(chapters[i].getChapterName())+".wiki"), | |
117 chapters[i].getContents().toString(), | |
118 null); | |
119 | |
120 } | |
121 System.out.println("Writing wikiIndex..."); | |
122 | |
123 FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null); | |
124 } | |
125 | |
126 public static String generateFilename(String input) { | |
127 return input.replaceAll("\\\\|/|:|\\(|\\)","-"); | |
128 | |
129 } | |
130 public String getWikiText() { | |
131 convert(); | |
132 return buffer.toString(); | |
133 } | |
134 | |
135 public WikiChapter[] getWikiChapters() { | |
136 | |
137 convert(); | |
138 | |
139 List<WikiChapter> chapters = new ArrayList<WikiChapter>(); | |
140 | |
141 Pattern chapterPat = Pattern.compile("<chapter>"); | |
142 Matcher begin = chapterPat.matcher(buffer); | |
143 Matcher end = chapterPat.matcher(buffer); | |
144 | |
145 while(begin.find()) { | |
146 | |
147 | |
148 end.find(begin.end()); | |
149 | |
150 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>"); | |
151 | |
152 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); | |
153 | |
154 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; | |
155 | |
156 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end() | |
157 ,end.hitEnd() ? buffer.length() : end.start()); | |
158 | |
159 chapters.add(new WikiChapter(chapterName,contents)); | |
160 | |
161 } | |
162 return (WikiChapter[])chapters.toArray(new WikiChapter[]{}); | |
163 } | |
164 | |
165 private void convert() { | |
166 | |
167 if(!converted) { | |
168 for (Transformer t : transformers) { | |
169 | |
170 System.out.println(".Applying: " + t); | |
171 t.apply(buffer); | |
172 | |
173 } | |
174 } | |
175 converted = true; | |
176 } | |
177 | |
178 private static class HtmlFileFilter implements FileFilter { | |
179 public boolean accept(File pathname) { | |
180 return pathname.getName().toLowerCase().matches("^.*\\.html$"); | |
181 } | |
182 | |
183 } | |
184 private static class WikiChapter { | |
185 private String chapterName; | |
186 private CharSequence contents; | |
187 | |
188 public WikiChapter(String chapterName, CharSequence contents) { | |
189 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&","and"); | |
190 | |
191 this.contents = contents; | |
192 } | |
193 | |
194 public String getChapterName() { | |
195 return chapterName; | |
196 } | |
197 | |
198 public CharSequence getContents() { | |
199 return contents; | |
200 } | |
201 | |
202 public String toString() { | |
203 return "Chapter: " + chapterName + "\nContents: " + contents; | |
204 } | |
205 } | |
206 | |
207 } |