Mercurial > public > html2wiki
comparison src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 6:99f293bd507f
Add "reflow" transformer to reflow paragraphs, list items, etc.
field | value |
---|---|
author | smith@nwoca.org |
date | Thu, 27 Jan 2011 16:37:27 -0500 |
parents | d34f4d408ef9 |
children | a634b4d554d4 |
comparison legend: equal | deleted | inserted | replaced
5:d34f4d408ef9 | 6:99f293bd507f |
---|---|
8 | 8 |
9 import java.io.*; | 9 import java.io.*; |
10 import java.util.Collection; | 10 import java.util.Collection; |
11 import java.util.ArrayList; | 11 import java.util.ArrayList; |
12 import java.util.List; | 12 import java.util.List; |
13 import java.util.Iterator; | |
14 import org.apache.commons.io.FileSystemUtils; | |
15 import org.apache.commons.io.FileUtils; | 13 import org.apache.commons.io.FileUtils; |
16 import java.util.regex.*; | 14 import java.util.regex.*; |
17 import org.apache.commons.io.FilenameUtils; | 15 import org.apache.commons.io.FilenameUtils; |
18 | 16 |
19 /** | 17 /** |
24 * by Document. Needs modification to work with other HTML files. | 22 * by Document. Needs modification to work with other HTML files. |
25 * | 23 * |
26 * @author SMITH | 24 * @author SMITH |
27 */ | 25 */ |
28 public class Html2Wiki { | 26 public class Html2Wiki { |
29 | 27 |
30 private StringBuffer buffer; | 28 private StringBuffer buffer; |
31 private Collection<Transformer> transformers; | 29 private Collection<Transformer> transformers; |
32 private boolean converted = false; | 30 private boolean converted = false; |
33 private static String category; | 31 private static String category; |
34 | 32 |
35 /** Creates a new instance of Html2Wiki. */ | 33 /** Creates a new instance of Html2Wiki. */ |
36 public Html2Wiki(String html) { | 34 public Html2Wiki(String html) { |
37 buffer = new StringBuffer(html); | 35 buffer = new StringBuffer(html); |
38 transformers = new ArrayList<Transformer>(); | 36 transformers = new ArrayList<Transformer>(); |
39 // transformers.add(new PreTagTransformer()); | 37 // transformers.add(new PreTagTransformer()); |
41 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>")); | 39 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>")); |
42 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true)); | 40 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true)); |
43 transformers.add(new DeleteTransformer("<a .*?>|</a>")); | 41 transformers.add(new DeleteTransformer("<a .*?>|</a>")); |
44 transformers.add(new DeleteTransformer("(?m)^\\*")); | 42 transformers.add(new DeleteTransformer("(?m)^\\*")); |
45 // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); | 43 // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); |
46 transformers.add(new DeleteTransformer("<p>")); | |
47 transformers.add(new DeleteTransformer("(?m)<br>$")); | 44 transformers.add(new DeleteTransformer("(?m)<br>$")); |
48 transformers.add(new DeleteTransformer("<font .*?>|</font>")); | 45 transformers.add(new DeleteTransformer("<font .*?>|</font>")); |
49 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>")); | 46 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>")); |
50 transformers.add(new BadTableDataTransformer()); | 47 transformers.add(new BadTableDataTransformer()); |
51 transformers.add(new BadTableRowTransformer()); | 48 transformers.add(new BadTableRowTransformer()); |
49 transformers.add(new ReflowTransformer()); | |
50 transformers.add(new DeleteTransformer("<p>")); | |
52 // transformers.add(new ReplaceTransformer("</td>","\n</td>")); | 51 // transformers.add(new ReplaceTransformer("</td>","\n</td>")); |
53 transformers.add(new ReplaceTransformer("\\{","\\{")); | 52 transformers.add(new ReplaceTransformer("\\{","\\{")); |
54 transformers.add(new ReplaceTransformer("\\}","\\}")); | 53 transformers.add(new ReplaceTransformer("\\}","\\}")); |
55 // transformers.add(new ReplaceTransformer("\\[","\\[")); | 54 // transformers.add(new ReplaceTransformer("\\[","\\[")); |
56 // transformers.add(new ReplaceTransformer("\\]","\\]")); | 55 // transformers.add(new ReplaceTransformer("\\]","\\]")); |
61 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}")); | 60 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}")); |
62 transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}")); | 61 transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}")); |
63 transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}")); | 62 transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}")); |
64 transformers.add(new ReplaceTransformer("<li>","{li}")); | 63 transformers.add(new ReplaceTransformer("<li>","{li}")); |
65 transformers.add(new ReplaceTransformer("</li>","{li}\n")); | 64 transformers.add(new ReplaceTransformer("</li>","{li}\n")); |
66 | 65 |
67 transformers.add(new ChapterTransformer(category)); | 66 transformers.add(new ChapterTransformer(category)); |
68 transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}")); | 67 transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}")); |
69 transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}")); | 68 transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}")); |
70 transformers.add(new TagTransformer("<em>(.*?)</em>", "*","*")); | 69 transformers.add(new TagTransformer("<em>(.*?)</em>", "*","*")); |
71 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "*","*")); | 70 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "*","*")); |
83 | 82 |
84 // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}")); | 83 // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}")); |
85 // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " ")); | 84 // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " ")); |
86 transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}")); | 85 transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}")); |
87 transformers.add(new DeleteTransformer("(?s)<hr.*?>")); | 86 transformers.add(new DeleteTransformer("(?s)<hr.*?>")); |
88 | 87 transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})")); |
89 } | 88 |
90 | 89 } |
90 | |
91 /** | 91 /** |
92 * @param args the command line arguments | 92 * @param args the command line arguments |
93 */ | 93 */ |
94 public static void main(String[] args) throws IOException { | 94 public static void main(String[] args) throws IOException { |
95 | 95 |
96 if (args.length == 0) { | 96 if (args.length == 0) { |
97 System.out.println("Usage:"); | 97 System.out.println("Usage:"); |
98 System.out.println(" Html2Wiki {inputDirectory} [Category]"); | 98 System.out.println(" Html2Wiki {inputDirectory} [Category]"); |
99 System.out.println(" default is current directory"); | 99 System.out.println(" default is current directory"); |
100 System.out.println(" Processes all *.html files. "); | 100 System.out.println(" Processes all *.html files. "); |
101 System.out.println(" Each 'chapter' written to *.wiki"); | 101 System.out.println(" Each 'chapter' written to *.wiki"); |
102 return; | 102 return; |
103 } | 103 } |
104 | 104 |
105 File inputs = new File(args[0]); | 105 File inputs = new File(args[0]); |
106 | 106 |
107 if (args.length > 1) { | 107 if (args.length > 1) { |
108 category = args[1]; | 108 category = args[1]; |
109 } | 109 } |
110 | 110 |
111 File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); | 111 File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); |
112 for (int i = 0; i < inputFiles.length; i++) { | 112 for (int i = 0; i < inputFiles.length; i++) { |
113 | 113 |
114 process(inputFiles[i]); | 114 process(inputFiles[i]); |
115 | 115 |
116 } | 116 } |
117 | 117 |
118 } | 118 } |
119 | 119 |
120 protected static void process(File input) throws IOException { | 120 protected static void process(File input) throws IOException { |
121 | 121 |
122 System.out.println(input.getAbsoluteFile()); | 122 System.out.println(input.getAbsoluteFile()); |
123 | 123 |
124 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null)); | 124 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null)); |
125 | 125 |
126 WikiChapter[] chapters = converter.getWikiChapters(); | 126 WikiChapter[] chapters = converter.getWikiChapters(); |
127 | 127 |
128 System.out.format("Writing %d wiki files...\n",chapters.length); | 128 System.out.format("Writing %d wiki files...\n", chapters.length); |
129 | 129 |
130 StringBuffer wikiIndex = new StringBuffer(); | 130 |
131 wikiIndex.append("Contents:\n\n"); | |
132 | |
133 for (int i = 0; i < chapters.length; i++) { | 131 for (int i = 0; i < chapters.length; i++) { |
134 | 132 |
135 wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n"); | |
136 FileUtils.writeStringToFile(new File(input.getParent(), | 133 FileUtils.writeStringToFile(new File(input.getParent(), |
137 generateFilename(chapters[i].getChapterName())+".wiki"), | 134 generateFilename(chapters[i].getChapterName()) + ".wiki"), |
138 chapters[i].getContents().toString(), | 135 chapters[i].getContents().toString(), |
139 null); | 136 null); |
140 | 137 |
141 } | 138 } |
142 System.out.println("Writing wikiIndex..."); | 139 |
143 | 140 } |
144 FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null); | 141 |
145 } | |
146 | |
147 public static String generateFilename(String input) { | 142 public static String generateFilename(String input) { |
148 return input.replaceAll("\\\\|/|:|\\(|\\)","-").replace("<br>", ""); | 143 return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("<br>", ""); |
149 | 144 |
150 } | 145 } |
146 | |
151 public String getWikiText() { | 147 public String getWikiText() { |
152 convert(); | 148 convert(); |
153 return buffer.toString(); | 149 return buffer.toString(); |
154 } | 150 } |
155 | 151 |
156 public WikiChapter[] getWikiChapters() { | 152 public WikiChapter[] getWikiChapters() { |
157 | 153 |
158 convert(); | 154 convert(); |
159 | 155 |
160 List<WikiChapter> chapters = new ArrayList<WikiChapter>(); | 156 List<WikiChapter> chapters = new ArrayList<WikiChapter>(); |
161 | 157 |
162 Pattern chapterPat = Pattern.compile("<chapter>"); | 158 Pattern chapterPat = Pattern.compile("<chapter>"); |
163 Matcher begin = chapterPat.matcher(buffer); | 159 Matcher begin = chapterPat.matcher(buffer); |
164 Matcher end = chapterPat.matcher(buffer); | 160 Matcher end = chapterPat.matcher(buffer); |
165 | 161 |
166 while(begin.find()) { | 162 while (begin.find()) { |
167 | 163 |
168 | 164 |
169 end.find(begin.end()); | 165 end.find(begin.end()); |
170 | 166 |
171 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>"); | 167 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>"); |
172 | 168 |
173 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); | 169 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); |
174 | 170 |
175 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; | 171 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; |
176 | 172 |
177 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end() | 173 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? buffer.length() : end.start()); |
178 ,end.hitEnd() ? buffer.length() : end.start()); | 174 |
179 | 175 chapters.add(new WikiChapter(chapterName, contents)); |
180 chapters.add(new WikiChapter(chapterName,contents)); | 176 |
181 | 177 } |
182 } | 178 return (WikiChapter[]) chapters.toArray(new WikiChapter[]{}); |
183 return (WikiChapter[])chapters.toArray(new WikiChapter[]{}); | 179 } |
184 } | 180 |
185 | |
186 private void convert() { | 181 private void convert() { |
187 | 182 |
188 if(!converted) { | 183 if (!converted) { |
189 for (Transformer t : transformers) { | 184 for (Transformer t : transformers) { |
190 | 185 |
191 System.out.println(".Applying: " + t); | 186 System.out.println(".Applying: " + t); |
192 t.apply(buffer); | 187 t.apply(buffer); |
193 | 188 |
194 } | 189 } |
195 } | 190 } |
196 converted = true; | 191 converted = true; |
197 } | 192 } |
198 | 193 |
199 private static class HtmlFileFilter implements FileFilter { | 194 private static class HtmlFileFilter implements FileFilter { |
195 | |
200 public boolean accept(File pathname) { | 196 public boolean accept(File pathname) { |
201 return pathname.getName().toLowerCase().matches("^.*\\.html$"); | 197 return pathname.getName().toLowerCase().matches("^.*\\.html$"); |
202 } | 198 } |
203 | 199 } |
204 } | 200 |
205 protected static class WikiChapter { | 201 protected static class WikiChapter { |
202 | |
206 private String chapterName; | 203 private String chapterName; |
207 private CharSequence contents; | 204 private CharSequence contents; |
208 | 205 |
209 public WikiChapter(String chapterName, CharSequence contents) { | 206 public WikiChapter(String chapterName, CharSequence contents) { |
210 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&","and"); | 207 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&", "and"); |
211 | 208 |
212 this.contents = contents; | 209 this.contents = contents; |
213 } | 210 } |
214 | 211 |
215 public String getChapterName() { | 212 public String getChapterName() { |
216 return chapterName; | 213 return chapterName; |
217 } | 214 } |
218 | 215 |
219 public CharSequence getContents() { | 216 public CharSequence getContents() { |
220 return contents; | 217 return contents; |
221 } | 218 } |
222 | 219 |
223 public String toString() { | 220 public String toString() { |
224 return "Chapter: " + chapterName + " Content length: " + contents.length(); | 221 return "Chapter: " + chapterName + " Content length: " + contents.length(); |
225 } | 222 } |
226 } | 223 } |
227 | |
228 } | 224 } |