comparison src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java @ 6:99f293bd507f

Add "reflow" transformer to reflow paragraphs, list items, etc.
author smith@nwoca.org
date Thu, 27 Jan 2011 16:37:27 -0500
parents d34f4d408ef9
children a634b4d554d4
comparison
equal deleted inserted replaced
5:d34f4d408ef9 6:99f293bd507f
8 8
9 import java.io.*; 9 import java.io.*;
10 import java.util.Collection; 10 import java.util.Collection;
11 import java.util.ArrayList; 11 import java.util.ArrayList;
12 import java.util.List; 12 import java.util.List;
13 import java.util.Iterator;
14 import org.apache.commons.io.FileSystemUtils;
15 import org.apache.commons.io.FileUtils; 13 import org.apache.commons.io.FileUtils;
16 import java.util.regex.*; 14 import java.util.regex.*;
17 import org.apache.commons.io.FilenameUtils; 15 import org.apache.commons.io.FilenameUtils;
18 16
19 /** 17 /**
24 * by Document. Needs modification to work with other HTML files. 22 * by Document. Needs modification to work with other HTML files.
25 * 23 *
26 * @author SMITH 24 * @author SMITH
27 */ 25 */
28 public class Html2Wiki { 26 public class Html2Wiki {
29 27
30 private StringBuffer buffer; 28 private StringBuffer buffer;
31 private Collection<Transformer> transformers; 29 private Collection<Transformer> transformers;
32 private boolean converted = false; 30 private boolean converted = false;
33 private static String category; 31 private static String category;
34 32
35 /** Creates a new instance of Html2Wiki. */ 33 /** Creates a new instance of Html2Wiki. */
36 public Html2Wiki(String html) { 34 public Html2Wiki(String html) {
37 buffer = new StringBuffer(html); 35 buffer = new StringBuffer(html);
38 transformers = new ArrayList<Transformer>(); 36 transformers = new ArrayList<Transformer>();
39 // transformers.add(new PreTagTransformer()); 37 // transformers.add(new PreTagTransformer());
41 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>")); 39 transformers.add(new DeleteTransformer("<html>|</html>|<body>|</body>"));
42 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true)); 40 transformers.add(new DeleteTransformer("<!--.*-->(\\n|\\r)*",true));
43 transformers.add(new DeleteTransformer("<a .*?>|</a>")); 41 transformers.add(new DeleteTransformer("<a .*?>|</a>"));
44 transformers.add(new DeleteTransformer("(?m)^\\*")); 42 transformers.add(new DeleteTransformer("(?m)^\\*"));
45 // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); 43 // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>"));
46 transformers.add(new DeleteTransformer("<p>"));
47 transformers.add(new DeleteTransformer("(?m)<br>$")); 44 transformers.add(new DeleteTransformer("(?m)<br>$"));
48 transformers.add(new DeleteTransformer("<font .*?>|</font>")); 45 transformers.add(new DeleteTransformer("<font .*?>|</font>"));
49 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>")); 46 transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>"));
50 transformers.add(new BadTableDataTransformer()); 47 transformers.add(new BadTableDataTransformer());
51 transformers.add(new BadTableRowTransformer()); 48 transformers.add(new BadTableRowTransformer());
49 transformers.add(new ReflowTransformer());
50 transformers.add(new DeleteTransformer("<p>"));
52 // transformers.add(new ReplaceTransformer("</td>","\n</td>")); 51 // transformers.add(new ReplaceTransformer("</td>","\n</td>"));
53 transformers.add(new ReplaceTransformer("\\{","\\{")); 52 transformers.add(new ReplaceTransformer("\\{","\\{"));
54 transformers.add(new ReplaceTransformer("\\}","\\}")); 53 transformers.add(new ReplaceTransformer("\\}","\\}"));
55 // transformers.add(new ReplaceTransformer("\\[","\\[")); 54 // transformers.add(new ReplaceTransformer("\\[","\\["));
56 // transformers.add(new ReplaceTransformer("\\]","\\]")); 55 // transformers.add(new ReplaceTransformer("\\]","\\]"));
61 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}")); 60 transformers.add(new ReplaceTransformer("<th.*?>|</th>","{th}"));
62 transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}")); 61 transformers.add(new ReplaceTransformer("<ol.*?>|</ol>","{ol}"));
63 transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}")); 62 transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}"));
64 transformers.add(new ReplaceTransformer("<li>","{li}")); 63 transformers.add(new ReplaceTransformer("<li>","{li}"));
65 transformers.add(new ReplaceTransformer("</li>","{li}\n")); 64 transformers.add(new ReplaceTransformer("</li>","{li}\n"));
66 65
67 transformers.add(new ChapterTransformer(category)); 66 transformers.add(new ChapterTransformer(category));
68 transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}")); 67 transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}"));
69 transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}")); 68 transformers.add(new TagTransformer("<center>(.*?)</center>", true, "{center}","{center}"));
70 transformers.add(new TagTransformer("<em>(.*?)</em>", "*","*")); 69 transformers.add(new TagTransformer("<em>(.*?)</em>", "*","*"));
71 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "*","*")); 70 transformers.add(new TagTransformer("<strong>(.*?)</strong>", "*","*"));
83 82
84 // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}")); 83 // transformers.add(new TagTransformer("\\{center}\\n\\{table}\\n\\{tr\\}\\n\\s{2}\\{td\\}\\{center\\}\\*Note\\*\\{center\\}(.*?)\\s\\{td\\}\\n\\s{2}\\{tr\\}\\{table\\}", "{note}", "{note}"));
85 // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " ")); 84 // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " "));
86 transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}")); 85 transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}"));
87 transformers.add(new DeleteTransformer("(?s)<hr.*?>")); 86 transformers.add(new DeleteTransformer("(?s)<hr.*?>"));
88 87 transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})"));
89 } 88
90 89 }
90
91 /** 91 /**
92 * @param args the command line arguments 92 * @param args the command line arguments
93 */ 93 */
94 public static void main(String[] args) throws IOException { 94 public static void main(String[] args) throws IOException {
95 95
96 if (args.length == 0) { 96 if (args.length == 0) {
97 System.out.println("Usage:"); 97 System.out.println("Usage:");
98 System.out.println(" Html2Wiki {inputDirectory} [Category]"); 98 System.out.println(" Html2Wiki {inputDirectory} [Category]");
99 System.out.println(" default is current directory"); 99 System.out.println(" default is current directory");
100 System.out.println(" Processes all *.html files. "); 100 System.out.println(" Processes all *.html files. ");
101 System.out.println(" Each 'chapter' written to *.wiki"); 101 System.out.println(" Each 'chapter' written to *.wiki");
102 return; 102 return;
103 } 103 }
104 104
105 File inputs = new File(args[0]); 105 File inputs = new File(args[0]);
106 106
107 if (args.length > 1) { 107 if (args.length > 1) {
108 category = args[1]; 108 category = args[1];
109 } 109 }
110 110
111 File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); 111 File[] inputFiles = inputs.listFiles(new HtmlFileFilter());
112 for (int i = 0; i < inputFiles.length; i++) { 112 for (int i = 0; i < inputFiles.length; i++) {
113 113
114 process(inputFiles[i]); 114 process(inputFiles[i]);
115 115
116 } 116 }
117 117
118 } 118 }
119 119
120 protected static void process(File input) throws IOException { 120 protected static void process(File input) throws IOException {
121 121
122 System.out.println(input.getAbsoluteFile()); 122 System.out.println(input.getAbsoluteFile());
123 123
124 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null)); 124 Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null));
125 125
126 WikiChapter[] chapters = converter.getWikiChapters(); 126 WikiChapter[] chapters = converter.getWikiChapters();
127 127
128 System.out.format("Writing %d wiki files...\n",chapters.length); 128 System.out.format("Writing %d wiki files...\n", chapters.length);
129 129
130 StringBuffer wikiIndex = new StringBuffer(); 130
131 wikiIndex.append("Contents:\n\n");
132
133 for (int i = 0; i < chapters.length; i++) { 131 for (int i = 0; i < chapters.length; i++) {
134 132
135 wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n");
136 FileUtils.writeStringToFile(new File(input.getParent(), 133 FileUtils.writeStringToFile(new File(input.getParent(),
137 generateFilename(chapters[i].getChapterName())+".wiki"), 134 generateFilename(chapters[i].getChapterName()) + ".wiki"),
138 chapters[i].getContents().toString(), 135 chapters[i].getContents().toString(),
139 null); 136 null);
140 137
141 } 138 }
142 System.out.println("Writing wikiIndex..."); 139
143 140 }
144 FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null); 141
145 }
146
147 public static String generateFilename(String input) { 142 public static String generateFilename(String input) {
148 return input.replaceAll("\\\\|/|:|\\(|\\)","-").replace("<br>", ""); 143 return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("<br>", "");
149 144
150 } 145 }
146
151 public String getWikiText() { 147 public String getWikiText() {
152 convert(); 148 convert();
153 return buffer.toString(); 149 return buffer.toString();
154 } 150 }
155 151
156 public WikiChapter[] getWikiChapters() { 152 public WikiChapter[] getWikiChapters() {
157 153
158 convert(); 154 convert();
159 155
160 List<WikiChapter> chapters = new ArrayList<WikiChapter>(); 156 List<WikiChapter> chapters = new ArrayList<WikiChapter>();
161 157
162 Pattern chapterPat = Pattern.compile("<chapter>"); 158 Pattern chapterPat = Pattern.compile("<chapter>");
163 Matcher begin = chapterPat.matcher(buffer); 159 Matcher begin = chapterPat.matcher(buffer);
164 Matcher end = chapterPat.matcher(buffer); 160 Matcher end = chapterPat.matcher(buffer);
165 161
166 while(begin.find()) { 162 while (begin.find()) {
167 163
168 164
169 end.find(begin.end()); 165 end.find(begin.end());
170 166
171 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>"); 167 Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>");
172 168
173 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); 169 Matcher chapterNameMatcher = chapterNamePat.matcher(buffer);
174 170
175 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; 171 String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null;
176 172
177 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end() 173 CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? buffer.length() : end.start());
178 ,end.hitEnd() ? buffer.length() : end.start()); 174
179 175 chapters.add(new WikiChapter(chapterName, contents));
180 chapters.add(new WikiChapter(chapterName,contents)); 176
181 177 }
182 } 178 return (WikiChapter[]) chapters.toArray(new WikiChapter[]{});
183 return (WikiChapter[])chapters.toArray(new WikiChapter[]{}); 179 }
184 } 180
185
186 private void convert() { 181 private void convert() {
187 182
188 if(!converted) { 183 if (!converted) {
189 for (Transformer t : transformers) { 184 for (Transformer t : transformers) {
190 185
191 System.out.println(".Applying: " + t); 186 System.out.println(".Applying: " + t);
192 t.apply(buffer); 187 t.apply(buffer);
193 188
194 } 189 }
195 } 190 }
196 converted = true; 191 converted = true;
197 } 192 }
198 193
199 private static class HtmlFileFilter implements FileFilter { 194 private static class HtmlFileFilter implements FileFilter {
195
200 public boolean accept(File pathname) { 196 public boolean accept(File pathname) {
201 return pathname.getName().toLowerCase().matches("^.*\\.html$"); 197 return pathname.getName().toLowerCase().matches("^.*\\.html$");
202 } 198 }
203 199 }
204 } 200
205 protected static class WikiChapter { 201 protected static class WikiChapter {
202
206 private String chapterName; 203 private String chapterName;
207 private CharSequence contents; 204 private CharSequence contents;
208 205
209 public WikiChapter(String chapterName, CharSequence contents) { 206 public WikiChapter(String chapterName, CharSequence contents) {
210 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&amp;","and"); 207 this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&amp;", "and");
211 208
212 this.contents = contents; 209 this.contents = contents;
213 } 210 }
214 211
215 public String getChapterName() { 212 public String getChapterName() {
216 return chapterName; 213 return chapterName;
217 } 214 }
218 215
219 public CharSequence getContents() { 216 public CharSequence getContents() {
220 return contents; 217 return contents;
221 } 218 }
222 219
223 public String toString() { 220 public String toString() {
224 return "Chapter: " + chapterName + " Content length: " + contents.length(); 221 return "Chapter: " + chapterName + " Content length: " + contents.length();
225 } 222 }
226 } 223 }
227
228 } 224 }