Make EPUB file Google Play Books compatible #5

Open: wants to merge 26 commits into master.

Commits (26)
e122405
Fixing the first couple simple errors
spiralman Dec 5, 2013
f07f24c
Don't add LaTeX source to epub file
spiralman Dec 6, 2013
7ce34ec
Removing all errors from book.html, including correcting reference to…
spiralman Dec 6, 2013
576e965
Fixing Content-Type http-equiv attributes
spiralman Dec 6, 2013
1eb81a2
Replacing illegal align attributes with correct CSS styles
spiralman Dec 6, 2013
2f83054
adding error counts for future reference
spiralman Dec 6, 2013
61d2651
Removing invalid name attribute from non-link anchor tags
spiralman Dec 6, 2013
33f32dc
Removing % from anchor IDs and corresponding hrefs
spiralman Dec 6, 2013
dc3971c
Updating error counts with latest values
spiralman Dec 6, 2013
46e076e
changing vstyle attribute to just be style
spiralman Dec 6, 2013
889235e
Generating alt attributes for all the img tags
spiralman Dec 6, 2013
8486aa2
Replacing border attributes with style attributes
spiralman Dec 6, 2013
8027eb6
Fixing most sub-section anchor references to use the actual anchor pr…
spiralman Dec 6, 2013
46a366e
Fixing the harder TOC references, and including the Python script tha…
spiralman Dec 6, 2013
f22e42e
Adding missing reference to page
spiralman Dec 6, 2013
d8d1bbc
Removing new file from content, as it is just the HTML TOC
spiralman Dec 6, 2013
d064cd0
Making sure HTML TOC doesn't go into epub file; fixing TOC references…
spiralman Dec 6, 2013
6d213bd
Removing references from sources to HTML TOC, which is not included i…
spiralman Dec 7, 2013
2e8d81f
Fixing python script to handle missing anchors around section titles,…
spiralman Dec 7, 2013
415d16a
Error counts after latest work
spiralman Dec 7, 2013
4a18e43
Putting the first chapter back together
spiralman Dec 7, 2013
d860cd2
Adding in all the source GIF files and a script to fetch them; disabl…
spiralman Dec 8, 2013
9054c2d
Removing more references to the split Chapter 1 file; fixing duplicat…
spiralman Dec 8, 2013
5557e70
Fixing invalid HTML (missing code end tag); this fixes all missing re…
spiralman Dec 8, 2013
6b36c4f
Updating error counts to latest
spiralman Dec 8, 2013
bbe00c5
Fixing cover image reference, adding missing CC license image
spiralman Dec 8, 2013
Makefile (13 changes: 9 additions & 4 deletions)
@@ -25,7 +25,7 @@ NORMAL_MARK := $(BUILD_DIR)/normal.mark
TEX_FILES := $(wildcard $(LATEX_DIR)/*.tex)
SVG_FILES := $(patsubst %.tex,%.svg,$(subst $(LATEX_DIR),$(IMAGES_DIR),$(TEX_FILES)))
GIF_FILES := $(patsubst %.tex,%.gif,$(subst $(LATEX_DIR),$(IMAGES_DIR),$(TEX_FILES)))
-CONTENT := $(shell find src -type f -not -name .DS_Store) src/mimetype
+CONTENT := $(shell find src -type f -not -name .DS_Store -not -name \*.tex -not -name \*.sty -not -name \*book-Z-H-4.html) src/mimetype
XML := $(shell find src -type f -name \*html)


@@ -44,6 +44,7 @@ check:
	xmllint --noout $(XML)

$(BUILD_DIR)/huge/%_cropped.pdf: $(BUILD_DIR)/huge/ $(LATEX_DIR)/%.tex
+	@echo "Building huge $@"
	sed 's/\\sicpsize}{\\fontsize{16}{18}/\\sicpsize}{\\fontsize{200}{220}/' < $(LATEX_DIR)/sicpstyle.sty > $(LATEX_DIR)/sicpstyle2.sty
	mv $(LATEX_DIR)/sicpstyle2.sty $(LATEX_DIR)/sicpstyle.sty
	cd $(LATEX_DIR) && pdflatex -output-dir ./build/huge/ ./$*.tex
@@ -58,14 +59,18 @@ $(IMAGES_DIR)/%.svg: $(BUILD_DIR)/huge/%.pbm


$(BUILD_DIR)/%_cropped.pdf: $(BUILD_DIR) $(LATEX_DIR)/%.tex
+	@echo "Building regular"
+	@echo "$(CONTENT)"
+	false
	sed 's/\\sicpsize}{\\fontsize{200}{220}/\\sicpsize}{\\fontsize{16}{18}/' < $(LATEX_DIR)/sicpstyle.sty > $(LATEX_DIR)/sicpstyle2.sty
	mv $(LATEX_DIR)/sicpstyle2.sty $(LATEX_DIR)/sicpstyle.sty
	cd $(LATEX_DIR) && pdflatex -output-dir ./build ./$*.tex
	pdfcrop --clip $(BUILD_DIR)/$*.pdf $(BUILD_DIR)/$*_cropped.pdf
	rm -f $(NORMAL_MARK)

-$(IMAGES_DIR)/%.gif: $(BUILD_DIR)/%_cropped.pdf
-	convert $(BUILD_DIR)/$*_cropped.pdf $@
+# $(IMAGES_DIR)/%.gif: $(BUILD_DIR)/%_cropped.pdf
+# @echo "Converting $@"
+# convert $(BUILD_DIR)/$*_cropped.pdf $@

$(BUILD_DIR):
	mkdir -p $(BUILD_DIR)
@@ -79,4 +84,4 @@ svg: $(SVG_FILES)
gif: $(GIF_FILES)

clean:
-	rm -rf sicp.epub $(BUILD_DIR) $(SVG_FILES) $(GIF_FILES) src/mimetype
+	rm -rf sicp.epub $(BUILD_DIR) $(SVG_FILES) src/mimetype
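
The Makefile's existing check target still covers the XHTML sources after these exclusions, so the markup cleanup can be re-verified at any point; for example:

make check

(check runs xmllint --noout over every *html file found under src, per the XML variable above.)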
error_counts.txt (16 changes: 16 additions & 0 deletions)
@@ -0,0 +1,16 @@
103 element "p" not allowed here; expected the element end-tag, text or element "a", "abbr", "acronym", "applet", "b", "bdo", "big", "br", "cite", "code", "del", "dfn", "em", "i", "iframe", "img", "ins", "kbd", "map", "noscript", "ns:svg", "object", "q", "samp", "script", "small", "span", "strong", "sub", "sup", "tt" or "var" (with xmlns:ns="http://www.w3.org/2000/svg")
101 element "div" not allowed here; expected the element end-tag, text or element "a", "abbr", "acronym", "applet", "b", "bdo", "big", "br", "cite", "code", "del", "dfn", "em", "i", "iframe", "img", "ins", "kbd", "map", "noscript", "ns:svg", "object", "q", "samp", "script", "small", "span", "strong", "sub", "sup", "tt" or "var" (with xmlns:ns="http://www.w3.org/2000/svg")
94 element "caption" not allowed here; expected the element end-tag or element "tr"
50 text not allowed here; expected the element end-tag or element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
39 text not allowed here; expected element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
35 element "u" not allowed anywhere; expected the element end-tag, text or element "a", "abbr", "acronym", "applet", "b", "bdo", "big", "br", "cite", "code", "del", "dfn", "em", "i", "iframe", "img", "ins", "kbd", "map", "noscript", "ns:svg", "object", "q", "samp", "script", "small", "span", "strong", "sub", "sup", "tt" or "var" (with xmlns:ns="http://www.w3.org/2000/svg")
35 element "a" not allowed here; expected element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
28 element "em" not allowed here; expected the element end-tag or element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
10 element "font" not allowed anywhere; expected the element end-tag, text or element "a", "abbr", "acronym", "applet", "b", "bdo", "big", "br", "cite", "code", "del", "dfn", "em", "i", "iframe", "img", "ins", "kbd", "map", "noscript", "ns:svg", "object", "q", "samp", "script", "small", "span", "strong", "sub", "sup", "tt" or "var" (with xmlns:ns="http://www.w3.org/2000/svg")
10 element "a" not allowed here; expected element "li"
8 element "blockquote" incomplete; expected element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
7 element "tt" not allowed here; expected element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
5 element "a" not allowed here; expected the element end-tag or element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
4 element "tt" not allowed here; expected the element end-tag or element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
3 element "p" not allowed here; expected element "li"
2 element "sub" not allowed here; expected the element end-tag or element "address", "blockquote", "del", "div", "dl", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ins", "noscript", "ns:svg", "ol", "p", "pre", "script", "table" or "ul" (with xmlns:ns="http://www.w3.org/2000/svg")
fetch_images.sh (13 changes: 13 additions & 0 deletions)
@@ -0,0 +1,13 @@
#! /bin/bash

SRC_DIR="src/OEBPS/"
IMAGE_SRC="http://mitpress.mit.edu/sicp/full-text/book/"

for img in `grep "media-type=\"image/" ${SRC_DIR}content.opf | sed "s/.*href=\"\([^\"]*\)\".*/\1/"`; do
    destination_file="${SRC_DIR}${img}"
    if [ ! -e "${destination_file}" ]; then
        image_file=`echo "${img}" | sed "s/^images\/\(.*\)/\1/"`
        echo ${destination_file}
        curl -s -o "${destination_file}" "${IMAGE_SRC}${image_file}"
    fi
done
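
A usage sketch, assuming the script is made executable and run from the repository root so that src/OEBPS/content.opf resolves:

chmod +x fetch_images.sh
./fetch_images.sh

It only downloads images declared in content.opf that are not already present on disk, so re-running it is cheap.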
fix_toc.py (114 changes: 114 additions & 0 deletions)
@@ -0,0 +1,114 @@
from collections import defaultdict, Counter
from HTMLParser import HTMLParser
import os.path
import re
import sys
from urlparse import urldefrag
from xml.etree import ElementTree

ncx_namespace = 'http://www.daisy.org/z3986/2005/ncx/'

ElementTree.register_namespace('', ncx_namespace)

toc_file = sys.argv[1]
content_dir, _ = os.path.split(toc_file)

found_sections = defaultdict(Counter)


def _tag_name(name):
    return '{{{0}}}{1}'.format(ncx_namespace, name)


def _section_id(section_number):
    return '__sec_' + section_number


def subsection_source(content_src, subsection_number):
    section_id = _section_id(subsection_number)

    new_src = '#'.join([content_src, section_id])
    return new_src


class SectionFinder(HTMLParser):
    def __init__(self, src_file, section_title):
        HTMLParser.__init__(self)
        self.last_id = None
        self.new_src = None
        self.consume_text = False
        self.current_text = ''
        self.section_title = section_title
        self.src_file = src_file
        self.section_title_index = found_sections[src_file][section_title]
        self.found_sections = 0

    def build_src(self, anchor_id):
        return '#'.join([self.src_file, anchor_id])

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a' and 'id' in attrs:
            self.last_id = attrs['id']
        elif self.last_id is not None and tag.startswith('h'):
            self.consume_text = True

    def handle_endtag(self, tag):
        if self.last_id and tag.startswith('h'):
            if self.current_text == self.section_title:
                if self.found_sections == self.section_title_index:
                    self.new_src = self.build_src(self.last_id)
                    found_sections[self.src_file][self.section_title] += 1

                self.found_sections += 1

            self.last_id = None
            self.consume_text = False
            self.current_text = ''

    def handle_data(self, data):
        if self.consume_text:
            self.current_text += data


def find_content_source(section_title, content_src):
    source_path = os.path.join(content_dir, content_src)
    section_finder = SectionFinder(content_src, section_title)

    with open(source_path) as source_file:
        section_finder.feed(source_file.read())

    return section_finder.new_src


doc = ElementTree.parse(toc_file)

for nav_point in doc.iter(_tag_name('navPoint')):
    text_node = nav_point.find(_tag_name('navLabel')).find(_tag_name('text'))
    content_node = nav_point.find(_tag_name('content'))

    if text_node is None or content_node is None:
        continue

    old_src = content_node.get('src')
    content_src, fragment = urldefrag(old_src)

    if not fragment:
        continue

    subsection_match = re.match(r'^([0-9.]+)', text_node.text)
    if subsection_match:
        new_src = subsection_source(content_src, subsection_match.group(1))
    else:
        new_src = find_content_source(text_node.text, content_src)

    if new_src is None:
        print 'Cannot find reference for {0} in {1}'.format(text_node.text, content_src)
        continue

    print old_src, '->', new_src

    content_node.set('src', new_src)

doc.write(toc_file + '-new', xml_declaration=True, encoding='UTF-8')
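
A usage sketch for the script above; the NCX path is an assumption, and the script is Python 2 (it imports HTMLParser and urlparse and uses print statements):

python fix_toc.py src/OEBPS/toc.ncx
mv src/OEBPS/toc.ncx-new src/OEBPS/toc.ncx

It prints each rewritten src entry, writes the corrected table of contents next to the original with a -new suffix, and leaves the original untouched until the new file is copied over it.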
src/OEBPS/book-Z-H-1.html (6 changes: 3 additions & 3 deletions)
@@ -5,15 +5,15 @@
(c) Dorai Sitaram, http://www.cs.rice.edu/~dorai/tex2page -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="Content-Type: text/html; charset=utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>Structure and Interpretation of Computer Programs</title>
<link rel="stylesheet" type="text/css" href="book-Z-C.css" title="default" />
</head>
<body>
<a name="titlepage" id="titlepage"></a>
<a id="titlepage"></a>
<h1>Structure and Interpretation<br />
of Computer Programs</h1>
<div align="left">second edition </div>
<div style="text-align: left;">second edition </div>

<p style="padding-top:7em;">Harold Abelson and Gerald Jay Sussman <br />
with Julie Sussman </p>
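
The align-to-CSS substitution shown above is one instance of a change the commits apply across the book's XHTML sources. A hedged sketch of how that kind of bulk replacement could be scripted, where the file glob and GNU sed's in-place flag are assumptions:

sed -i 's/<div align="left">/<div style="text-align: left;">/g' src/OEBPS/book-Z-H-*.html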