Turning a WordPress Blog into a Book: Part One

Lately my lovely wife has been asking me to help her in the task of turning her blog (a WordPress installation like this one that I maintain for her) into a book. It’s a pretty daunting task, since her blog would easily surpass 1,000 pages if printed. After a few days of trying out different services and trying a variety of search terms, I just couldn’t find a good match for her blog.

Most services that turn blogs into books require you to use one of the big hosting sites (like yourblog.wordpress.com). Of those that will take a simple WordPress XML export, the only one I could find that could handle the enormous file was a site called FastPencil. A universal problem, though, was finding a service that would import the file AND retain some semblance of layout in regard to pictures and text, block-quotes, centering, etc.

Then I discovered a project called WPTEX. It is a collection of PHP scripts that asks a few questions and then converts (or, attempts to convert) a WordPress blog to LaTeX, which can in turn be converted to a PDF using PDFLaTeX. The only problem was WPTEX didn’t really do a graceful job with my wife’s blog, which is rich in pictures and special formatting.

By this time I had enough information to decide to forge out on my own. I started by writing some C# that would read a WordPress export file (quick and dirty-like) into some data structures. Then I went off looking for an HTML-to-LaTeX converter online. I discovered Pandoc. After installing the Haskell-based markup converter, I started using it to convert my wife’s post titles and content to LaTeX. It was a brilliant success. I then wrote some code to recognize when it needed to download an image to include in the book and to strip out the obvious hyperlinks. Now we’re in the process of manually doing a lot of the centering and layout changes that were so cumbersome using those online services. To my surprise, LaTeX has been very easy to work with. In part two I’ll cover some of the (did I say dirty?) code that made it all come together.

2 Comments

  1. emre says:

    So how did you do it? Could you share the code?

  2. Dave says:

    I am going to post the code here for you, but be warned I haven’t done much to document or genericize anything. This worked specifically for my wife’s blog and took some tinkering:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Xml;
    using System.IO;
    using System.Net;
    using System.Web;
    using System.Diagnostics;
    using System.Threading;

    namespace WordPressTools.ConsoleTest {
    /// <summary>
    /// Console tool that turns a WordPress XML export into a single LaTeX file.
    /// Every post that has content and status "publish" is converted from HTML to
    /// LaTeX by the external pandoc executable, images referenced by the post are
    /// downloaded next to the generated document, and the result is appended to
    /// the document as a \section.
    /// </summary>
    class Program {
        /// <summary>
        /// Reads the export, writes the LaTeX preamble, appends one section per
        /// post and closes the document. All paths are the hard-coded ones below.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args) {
            CollectedDocumentInfo docInfo = ReadExport(@"path-to-wordpress-export.xml");

            Console.WriteLine(String.Empty);

            // The .tex file and the downloaded images share this folder; create it
            // up front so neither the writer nor the downloads fail on a fresh machine.
            Directory.CreateDirectory(outputFolder);

            using (StreamWriter sw = new StreamWriter(outputFolder + "ff.tex", false)) {
                sw.WriteLine(@"\documentclass[11pt]{article}
    \usepackage{graphicx}
    \usepackage{hyperref}
    \usepackage[normalem]{ulem}
    % Default margins are too wide all the way around. I reset them here
    \setlength{\topmargin}{-.5in}
    \setlength{\textheight}{9in}
    \setlength{\oddsidemargin}{.125in}
    \setlength{\textwidth}{6.25in}
    \begin{document}
    \title{God's Faithfulness Through Infertility}
    \author{Part 1 - (Some name for Part 1)\\Elaine Sheldon}
    \maketitle
    ""'For I know the plans I have for you,' declares the LORD, 'plans to prosper you and not to harm you, plans to give you hope and a future."" Jeremiah 29:11

    ");
                foreach (PostInfo post in docInfo.Posts) {
                    Console.WriteLine(post.Title + " - " + post.Date.ToShortDateString());

                    // An item without a <title> leaves Title null; treat it as empty
                    // so the escaping below cannot throw.
                    post.Title = EscapeForLatex(System.Web.HttpUtility.HtmlDecode(post.Title ?? String.Empty));

                    ConvertHtmlToLatex(post.Content);
                    RewriteImageReferences();

                    string contentLatex = File.ReadAllText(fixedOutputPath);

                    sw.WriteLine(String.Format(@"\section{{{0} - {1}}}

    {2}", post.Title, post.Date.ToShortDateString(), contentLatex));
                }
                sw.Write(@"\end{document}");
            }

            Console.WriteLine(String.Empty);
            Console.WriteLine("Press enter to exit...");

            Console.ReadLine();
        }

        /// <summary>
        /// Parses a WordPress export file into document-level fields and a list of posts.
        /// Posts whose content is empty, or whose wp:status is not "publish", are left out.
        /// </summary>
        /// <param name="exportPath">Path of the WordPress XML export file.</param>
        /// <returns>The collected title, subtitle, author and posts.</returns>
        static CollectedDocumentInfo ReadExport(string exportPath) {
            CollectedDocumentInfo docInfo = new CollectedDocumentInfo();
            docInfo.Posts = new List<PostInfo>();
            PostInfo currentPost = null;

            // OpenRead: the export is only read, so do not request write access.
            using (XmlTextReader loader = new XmlTextReader(File.OpenRead(exportPath))) {
                if (loader.ReadToDescendant("channel")) {
                    while (loader.Read()) {
                        if (!loader.IsStartElement()) {
                            continue;
                        }

                        if (loader.Depth == 2) {
                            // Direct children of <channel>.
                            if (loader.Name == "title") {
                                docInfo.Title = loader.ReadString();
                            }
                            else if (loader.Name == "description") {
                                docInfo.SubTitle = loader.ReadString();
                            }
                            else if (loader.Name == "item") {
                                // A new item starts: drop the previous one if it ended up
                                // without content (empty, or blanked because unpublished).
                                if (currentPost != null && String.IsNullOrWhiteSpace(currentPost.Content)) {
                                    docInfo.Posts.Remove(currentPost);
                                }
                                currentPost = new PostInfo();
                                docInfo.Posts.Add(currentPost);
                            }
                        }
                        else if (loader.Depth == 3) {
                            if (loader.Name == "wp:author_display_name") {
                                docInfo.Author = loader.ReadString();
                            }
                            else if (currentPost != null) {
                                // Depth-3 elements can also belong to non-item children of
                                // <channel>; only read post fields once an item has started.
                                if (loader.Name == "title") {
                                    currentPost.Title = loader.ReadString();
                                }
                                else if (loader.Name == "content:encoded") {
                                    currentPost.Content = loader.ReadString();
                                }
                                else if (loader.Name == "wp:post_date") {
                                    // A date that cannot be parsed leaves Date at its default
                                    // instead of aborting the whole conversion.
                                    DateTime parsedDate;
                                    if (DateTime.TryParse(loader.ReadString(), out parsedDate)) {
                                        currentPost.Date = parsedDate;
                                    }
                                }
                                else if (loader.Name == "wp:status") {
                                    string status = loader.ReadString();
                                    if (status != "publish") {
                                        // Blank the content so the post is pruned.
                                        currentPost.Content = null;
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // The pruning above only runs when the NEXT item starts, so the final
            // item has to be checked here.
            if (currentPost != null && String.IsNullOrWhiteSpace(currentPost.Content)) {
                docInfo.Posts.Remove(currentPost);
            }

            return docInfo;
        }

        /// <summary>
        /// Escapes the characters &amp;, #, %, $ and _ with a leading backslash so the
        /// text can be placed inside a LaTeX \section title.
        /// </summary>
        /// <param name="text">Plain (already HTML-decoded) text; must not be null.</param>
        /// <returns>The text with those characters escaped.</returns>
        static string EscapeForLatex(string text) {
            return text
                .Replace("&", "\\&")
                .Replace("#", "\\#")
                .Replace("%", "\\%")
                .Replace("$", "\\$")
                .Replace("_", "\\_");
        }

        /// <summary>
        /// Writes the HTML to <c>panDocInputPath</c>, runs pandoc to convert it to
        /// LaTeX at <c>panDocOutputPath</c>, and blocks until pandoc has exited.
        /// </summary>
        /// <param name="contentHtml">HTML body of one post.</param>
        static void ConvertHtmlToLatex(string contentHtml) {
            if (File.Exists(panDocInputPath)) {
                File.Delete(panDocInputPath);
            }

            // Remove the previous post's output so a failed pandoc run cannot
            // silently reuse it.
            if (File.Exists(panDocOutputPath)) {
                File.Delete(panDocOutputPath);
            }

            File.WriteAllText(panDocInputPath, contentHtml);

            ProcessStartInfo psiPanDoc = new ProcessStartInfo(panDocPath, panDocInputPath + " -f html -t latex -o " + panDocOutputPath);
            psiPanDoc.WindowStyle = ProcessWindowStyle.Hidden;

            using (Process pPanDoc = Process.Start(psiPanDoc)) {
                if (pPanDoc != null) {
                    // Block until pandoc is done; polling HasExited with the wrong
                    // polarity either skipped the wait or never returned.
                    pPanDoc.WaitForExit();
                }
            }
        }

        /// <summary>
        /// Copies <c>panDocOutputPath</c> to <c>fixedOutputPath</c> while
        /// (1) splitting every line at LaTeX line breaks (\\), writing each piece on its own line,
        /// (2) unwrapping pieces of the form \href{...}{...}} down to their inner part, and
        /// (3) downloading each \includegraphics target into the output folder and
        /// pointing the reference at the local copy.
        /// A piece whose image cannot be downloaded is omitted from the output.
        /// </summary>
        static void RewriteImageReferences() {
            string searchString = @"\includegraphics{";
            using (StreamReader texIn = new StreamReader(panDocOutputPath))
            using (StreamWriter texOut = new StreamWriter(fixedOutputPath)) {
                while (!texIn.EndOfStream) {
                    string rawLine = texIn.ReadLine();

                    string[] linesDerived = rawLine.Split(new string[] { "\\\\" }, StringSplitOptions.None);
                    foreach (string lineFound in linesDerived) {
                        string myLine = lineFound;

                        if (myLine.StartsWith("\\href{") && myLine.EndsWith("}}")) {
                            // Skip past "\href{url}{" and drop the final "}".
                            myLine = myLine.Substring(myLine.IndexOf('}') + 2);
                            myLine = myLine.Substring(0, myLine.Length - 1);
                        }

                        int indexOfGraphics = myLine.IndexOf(searchString);
                        if (indexOfGraphics > -1) {
                            int indexToStart = indexOfGraphics + searchString.Length;

                            string webFile = myLine.Substring(indexToStart);
                            webFile = webFile.Substring(0, webFile.IndexOf('}'));
                            string localFile = "./" + webFile.Substring(webFile.LastIndexOf('/') + 1).Replace("?imgmax=800", "").Replace("%", "");

                            string downloadFile = outputFolder + localFile.Substring(2);
                            try {
                                if (!File.Exists(downloadFile)) {
                                    using (WebClient downloader = new WebClient()) {
                                        downloader.DownloadFile(webFile, downloadFile);
                                    }
                                }
                            }
                            catch (Exception ex) {
                                // Best effort: report the failure and drop this piece rather
                                // than leave a reference to a file that is not on disk.
                                Console.WriteLine("Could not download " + webFile + ": " + ex.Message);
                                continue;
                            }
                            myLine = myLine.Replace(webFile, localFile);
                        }
                        texOut.WriteLine(myLine);
                    }
                }
            }
        }

        // Scratch files exchanged with pandoc, the post-processed result, the pandoc
        // executable, and the folder that receives ff.tex and the downloaded images.
        static string panDocInputPath = @"c:\pan-in.txt";
        static string panDocOutputPath = @"c:\pan-out.txt";
        static string fixedOutputPath = @"c:\pan-out-fixed.txt";
        static string panDocPath = @"c:\program files (x86)\pandoc\bin\pandoc.exe";
        static string outputFolder = @"c:\fftex\";
    }

    /// <summary>
    /// Everything collected from one WordPress export: the channel-level
    /// title, description and author display name, plus the list of posts.
    /// </summary>
    class CollectedDocumentInfo {
        /// <summary>Text of the channel's title element.</summary>
        public string Title { get; set; }

        /// <summary>Text of the channel's description element.</summary>
        public string SubTitle { get; set; }

        /// <summary>Text of the wp:author_display_name element.</summary>
        public string Author { get; set; }

        /// <summary>
        /// Posts read from the export's item elements. The element type must be
        /// spelled out: System.Collections.Generic has no non-generic List.
        /// </summary>
        public List<PostInfo> Posts { get; set; }
    }

    /// <summary>
    /// A single blog post as read from one item element of the export.
    /// </summary>
    internal class PostInfo {
        /// <summary>Value parsed from the item's wp:post_date element.</summary>
        public DateTime Date { get; set; }

        /// <summary>Text of the item's title element.</summary>
        public string Title { get; set; }

        /// <summary>
        /// Text of the item's content:encoded element; set to null when the
        /// item's wp:status is anything other than "publish".
        /// </summary>
        public string Content { get; set; }
    }
    }

Leave a Reply