Mercurial > public > html2wiki
changeset 6:99f293bd507f
Add "reflow" transformer to reflow paragraphs, list items, etc.
author | smith@nwoca.org |
---|---|
date | Thu, 27 Jan 2011 16:37:27 -0500 |
parents | d34f4d408ef9 |
children | a634b4d554d4 |
files | src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java src/org/nwoca/ssdt/tools/html2wiki/ReflowTransformer.java |
diffstat | 2 files changed, 118 insertions(+), 69 deletions(-) [+] |
line wrap: on
line diff
--- a/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java Thu Jan 27 14:32:34 2011 -0500 +++ b/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java Thu Jan 27 16:37:27 2011 -0500 @@ -10,8 +10,6 @@ import java.util.Collection; import java.util.ArrayList; import java.util.List; -import java.util.Iterator; -import org.apache.commons.io.FileSystemUtils; import org.apache.commons.io.FileUtils; import java.util.regex.*; import org.apache.commons.io.FilenameUtils; @@ -26,12 +24,12 @@ * @author SMITH */ public class Html2Wiki { - + private StringBuffer buffer; private Collection<Transformer> transformers; private boolean converted = false; private static String category; - + /** Creates a new instance of Html2Wiki. */ public Html2Wiki(String html) { buffer = new StringBuffer(html); @@ -43,12 +41,13 @@ transformers.add(new DeleteTransformer("<a .*?>|</a>")); transformers.add(new DeleteTransformer("(?m)^\\*")); // transformers.add(new DeleteTransformer("<blockquote>|</blockquote>")); - transformers.add(new DeleteTransformer("<p>")); transformers.add(new DeleteTransformer("(?m)<br>$")); transformers.add(new DeleteTransformer("<font .*?>|</font>")); transformers.add(new CloseTagTransformer("<li>","(\n|\r)*(<li>|</ul>|</ol>|<ul>|<ol>)","</li>")); transformers.add(new BadTableDataTransformer()); transformers.add(new BadTableRowTransformer()); + transformers.add(new ReflowTransformer()); + transformers.add(new DeleteTransformer("<p>")); // transformers.add(new ReplaceTransformer("</td>","\n</td>")); transformers.add(new ReplaceTransformer("\\{","\\{")); transformers.add(new ReplaceTransformer("\\}","\\}")); @@ -63,7 +62,7 @@ transformers.add(new ReplaceTransformer("<ul.*?>|</ul>","{ul}")); transformers.add(new ReplaceTransformer("<li>","{li}")); transformers.add(new ReplaceTransformer("</li>","{li}\n")); - + transformers.add(new ChapterTransformer(category)); transformers.add(new TagTransformer("<pre>(.*?)</pre>", true, "{code}","{code}")); transformers.add(new 
TagTransformer("<center>(.*?)</center>", true, "{center}","{center}")); @@ -85,14 +84,15 @@ // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " ")); transformers.add(new TagTransformer("<blockquote>(.*)</blockquote>", "{quote}", "{quote}")); transformers.add(new DeleteTransformer("(?s)<hr.*?>")); - + transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})")); + } - + /** * @param args the command line arguments */ public static void main(String[] args) throws IOException { - + if (args.length == 0) { System.out.println("Usage:"); System.out.println(" Html2Wiki {inputDirectory} [Category]"); @@ -101,128 +101,124 @@ System.out.println(" Each 'chapter' written to *.wiki"); return; } - + File inputs = new File(args[0]); - + if (args.length > 1) { category = args[1]; } - + File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); for (int i = 0; i < inputFiles.length; i++) { - + process(inputFiles[i]); - + } - + } - + protected static void process(File input) throws IOException { - + System.out.println(input.getAbsoluteFile()); - - Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null)); - + + Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null)); + WikiChapter[] chapters = converter.getWikiChapters(); - - System.out.format("Writing %d wiki files...\n",chapters.length); + + System.out.format("Writing %d wiki files...\n", chapters.length); - StringBuffer wikiIndex = new StringBuffer(); - wikiIndex.append("Contents:\n\n"); - + for (int i = 0; i < chapters.length; i++) { - - wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n"); + FileUtils.writeStringToFile(new File(input.getParent(), - generateFilename(chapters[i].getChapterName())+".wiki"), - chapters[i].getContents().toString(), - null); - + generateFilename(chapters[i].getChapterName()) + ".wiki"), + chapters[i].getContents().toString(), + null); + } - System.out.println("Writing wikiIndex..."); - - FileUtils.writeStringToFile(new 
File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null); + } - + public static String generateFilename(String input) { - return input.replaceAll("\\\\|/|:|\\(|\\)","-").replace("<br>", ""); - + return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("<br>", ""); + } + public String getWikiText() { convert(); return buffer.toString(); } - + public WikiChapter[] getWikiChapters() { - + convert(); - + List<WikiChapter> chapters = new ArrayList<WikiChapter>(); - + Pattern chapterPat = Pattern.compile("<chapter>"); Matcher begin = chapterPat.matcher(buffer); Matcher end = chapterPat.matcher(buffer); - - while(begin.find()) { - - + + while (begin.find()) { + + end.find(begin.end()); - + Pattern chapterNamePat = Pattern.compile("<chapter>(.*?)</chapter>"); - + Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); - + String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; - - CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end() - ,end.hitEnd() ? buffer.length() : end.start()); - - chapters.add(new WikiChapter(chapterName,contents)); - + + CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? 
buffer.length() : end.start()); + + chapters.add(new WikiChapter(chapterName, contents)); + } - return (WikiChapter[])chapters.toArray(new WikiChapter[]{}); + return (WikiChapter[]) chapters.toArray(new WikiChapter[]{}); } - + private void convert() { - - if(!converted) { + + if (!converted) { for (Transformer t : transformers) { - + System.out.println(".Applying: " + t); t.apply(buffer); - + } } converted = true; } - + private static class HtmlFileFilter implements FileFilter { + public boolean accept(File pathname) { return pathname.getName().toLowerCase().matches("^.*\\.html$"); } - } + protected static class WikiChapter { + private String chapterName; private CharSequence contents; - + public WikiChapter(String chapterName, CharSequence contents) { - this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&","and"); - + this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&", "and"); + this.contents = contents; } - + public String getChapterName() { return chapterName; } - + public CharSequence getContents() { return contents; } - + public String toString() { return "Chapter: " + chapterName + " Content length: " + contents.length(); } } - }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/org/nwoca/ssdt/tools/html2wiki/ReflowTransformer.java Thu Jan 27 16:37:27 2011 -0500 @@ -0,0 +1,53 @@ +package org.nwoca.ssdt.tools.html2wiki; + +import java.util.regex.*; + +class ReflowTransformer implements Transformer { + + private Pattern[] patterns = { + Pattern.compile("(\\n<p>)(.*?)(\\n\\n|\\n<)", Pattern.MULTILINE + Pattern.DOTALL), + Pattern.compile("(<li>)(.*?)(</li>)", Pattern.MULTILINE + Pattern.DOTALL), + Pattern.compile("(<td>)([^<]*)(</td>)", Pattern.MULTILINE + Pattern.DOTALL) + }; + + /** + * Default transformer reflows paragraphs, li's and td's. + * + */ + public ReflowTransformer() { + } + + /** + * Create transformer with specific regexp. + * + * Regexp must provide three groups: (before)(textToReflow)(after). + * + * @param regexp + */ + public ReflowTransformer(String regexp) { + patterns = new Pattern[]{ + Pattern.compile(regexp, Pattern.MULTILINE + Pattern.DOTALL) + }; + } + + public void apply(StringBuffer buffer) { + for (Pattern pattern : patterns) { + + System.out.println(" Reflowing: " + pattern); + Matcher matcher = pattern.matcher(buffer); + + int start = 0; + while (matcher.find(start)) { + String temp = matcher.group(2); + temp = temp.replaceAll(" \\n", " "); + buffer.replace(matcher.start(), matcher.end(), matcher.group(1) + temp + matcher.group(3)); + start = matcher.start() + matcher.group(1).length() + temp.length() + matcher.group(3).length() - 1; + } + } + + } + + public String toString() { + return "Reflowing block tags"; + } +}