# HG changeset patch # User smith@nwoca.org # Date 1296164247 18000 # Node ID 99f293bd507f2dc9e03079611d580d5b1a6bbdb8 # Parent d34f4d408ef9814ccd8a0ec57e5859b164cff4da Add "reflow" transformer to reflow paragraphs, list items, etc. diff -r d34f4d408ef9 -r 99f293bd507f src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java --- a/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java Thu Jan 27 14:32:34 2011 -0500 +++ b/src/org/nwoca/ssdt/tools/html2wiki/Html2Wiki.java Thu Jan 27 16:37:27 2011 -0500 @@ -10,8 +10,6 @@ import java.util.Collection; import java.util.ArrayList; import java.util.List; -import java.util.Iterator; -import org.apache.commons.io.FileSystemUtils; import org.apache.commons.io.FileUtils; import java.util.regex.*; import org.apache.commons.io.FilenameUtils; @@ -26,12 +24,12 @@ * @author SMITH */ public class Html2Wiki { - + private StringBuffer buffer; private Collection transformers; private boolean converted = false; private static String category; - + /** Creates a new instance of Html2Wiki. */ public Html2Wiki(String html) { buffer = new StringBuffer(html); @@ -43,12 +41,13 @@ transformers.add(new DeleteTransformer("|")); transformers.add(new DeleteTransformer("(?m)^\\*")); // transformers.add(new DeleteTransformer("
|
")); - transformers.add(new DeleteTransformer("

")); transformers.add(new DeleteTransformer("(?m)
$")); transformers.add(new DeleteTransformer("|")); transformers.add(new CloseTagTransformer("

  • ","(\n|\r)*(
  • |||","{ul}")); transformers.add(new ReplaceTransformer("
  • ","{li}")); transformers.add(new ReplaceTransformer("
  • ","{li}\n")); - + transformers.add(new ChapterTransformer(category)); transformers.add(new TagTransformer("
    (.*?)
    ", true, "{code}","{code}")); transformers.add(new TagTransformer("
    (.*?)
    ", true, "{center}","{center}")); @@ -85,14 +84,15 @@ // transformers.add(new TagTransformer("(\\S)\\s\\n", "", " ")); transformers.add(new TagTransformer("
    (.*)
    ", "{quote}", "{quote}")); transformers.add(new DeleteTransformer("(?s)")); - + transformers.add(new ReflowTransformer("(\\{note\\})([^\\{]*)(\\{note\\})")); + } - + /** * @param args the command line arguments */ public static void main(String[] args) throws IOException { - + if (args.length == 0) { System.out.println("Usage:"); System.out.println(" Html2Wiki {inputDirectory} [Category]"); @@ -101,128 +101,124 @@ System.out.println(" Each 'chapter' written to *.wiki"); return; } - + File inputs = new File(args[0]); - + if (args.length > 1) { category = args[1]; } - + File[] inputFiles = inputs.listFiles(new HtmlFileFilter()); for (int i = 0; i < inputFiles.length; i++) { - + process(inputFiles[i]); - + } - + } - + protected static void process(File input) throws IOException { - + System.out.println(input.getAbsoluteFile()); - - Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input,null)); - + + Html2Wiki converter = new Html2Wiki(FileUtils.readFileToString(input, null)); + WikiChapter[] chapters = converter.getWikiChapters(); - - System.out.format("Writing %d wiki files...\n",chapters.length); + + System.out.format("Writing %d wiki files...\n", chapters.length); - StringBuffer wikiIndex = new StringBuffer(); - wikiIndex.append("Contents:\n\n"); - + for (int i = 0; i < chapters.length; i++) { - - wikiIndex.append("# [[" + chapters[i].getChapterName() + "]]\n"); + FileUtils.writeStringToFile(new File(input.getParent(), - generateFilename(chapters[i].getChapterName())+".wiki"), - chapters[i].getContents().toString(), - null); - + generateFilename(chapters[i].getChapterName()) + ".wiki"), + chapters[i].getContents().toString(), + null); + } - System.out.println("Writing wikiIndex..."); - - FileUtils.writeStringToFile(new File(FilenameUtils.removeExtension(input.getPath())+".wikiIndex"),wikiIndex.toString(),null); + } - + public static String generateFilename(String input) { - return input.replaceAll("\\\\|/|:|\\(|\\)","-").replace("
    ", ""); - + return input.replaceAll("\\\\|/|:|\\(|\\)", "-").replace("
    ", ""); + } + public String getWikiText() { convert(); return buffer.toString(); } - + public WikiChapter[] getWikiChapters() { - + convert(); - + List chapters = new ArrayList(); - + Pattern chapterPat = Pattern.compile(""); Matcher begin = chapterPat.matcher(buffer); Matcher end = chapterPat.matcher(buffer); - - while(begin.find()) { - - + + while (begin.find()) { + + end.find(begin.end()); - + Pattern chapterNamePat = Pattern.compile("(.*?)"); - + Matcher chapterNameMatcher = chapterNamePat.matcher(buffer); - + String chapterName = chapterNameMatcher.find(begin.start()) ? chapterNameMatcher.group(1) : null; - - CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end() - ,end.hitEnd() ? buffer.length() : end.start()); - - chapters.add(new WikiChapter(chapterName,contents)); - + + CharSequence contents = buffer.subSequence(chapterName == null ? begin.start() : chapterNameMatcher.end(), end.hitEnd() ? buffer.length() : end.start()); + + chapters.add(new WikiChapter(chapterName, contents)); + } - return (WikiChapter[])chapters.toArray(new WikiChapter[]{}); + return (WikiChapter[]) chapters.toArray(new WikiChapter[]{}); } - + private void convert() { - - if(!converted) { + + if (!converted) { for (Transformer t : transformers) { - + System.out.println(".Applying: " + t); t.apply(buffer); - + } } converted = true; } - + private static class HtmlFileFilter implements FileFilter { + public boolean accept(File pathname) { return pathname.getName().toLowerCase().matches("^.*\\.html$"); } - } + protected static class WikiChapter { + private String chapterName; private CharSequence contents; - + public WikiChapter(String chapterName, CharSequence contents) { - this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)","-").replaceAll("\\s+"," ").replaceAll("&","and"); - + this.chapterName = chapterName.replaceAll("\\\\|/|:|\\(|\\)", "-").replaceAll("\\s+", " ").replaceAll("&", "and"); + this.contents = contents; } - + 
public String getChapterName() { return chapterName; } - + public CharSequence getContents() { return contents; } - + public String toString() { return "Chapter: " + chapterName + " Content length: " + contents.length(); } } - } diff -r d34f4d408ef9 -r 99f293bd507f src/org/nwoca/ssdt/tools/html2wiki/ReflowTransformer.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/org/nwoca/ssdt/tools/html2wiki/ReflowTransformer.java Thu Jan 27 16:37:27 2011 -0500 @@ -0,0 +1,53 @@ +package org.nwoca.ssdt.tools.html2wiki; + +import java.util.regex.*; + +class ReflowTransformer implements Transformer { + + private Pattern[] patterns = { + Pattern.compile("(\\n

    )(.*?)(\\n\\n|\\n<)", Pattern.MULTILINE + Pattern.DOTALL), + Pattern.compile("(

  • )(.*?)(
  • )", Pattern.MULTILINE + Pattern.DOTALL), + Pattern.compile("()([^<]*)()", Pattern.MULTILINE + Pattern.DOTALL) + }; + + /** + * Default transformer reflows paragraphs, li's and td's. + * + */ + public ReflowTransformer() { + } + + /** + * Create transformer with specific regexp. + * + * Regexp must provide three groups: (before)(textToReflow)(after). + * + * @param regexp + */ + public ReflowTransformer(String regexp) { + patterns = new Pattern[]{ + Pattern.compile(regexp, Pattern.MULTILINE + Pattern.DOTALL) + }; + } + + public void apply(StringBuffer buffer) { + for (Pattern pattern : patterns) { + + System.out.println(" Reflowing: " + pattern); + Matcher matcher = pattern.matcher(buffer); + + int start = 0; + while (matcher.find(start)) { + String temp = matcher.group(2); + temp = temp.replaceAll(" \\n", " "); + buffer.replace(matcher.start(), matcher.end(), matcher.group(1) + temp + matcher.group(3)); + start = matcher.start() + matcher.group(1).length() + temp.length() + matcher.group(3).length() - 1; + } + } + + } + + public String toString() { + return "Reflowing block tags"; + } +}