#! /usr/bin/env python

"""List, add, or delete subject tags (dc:subject elements) in EPUB files.

An EPUB is a zip archive; its metadata lives in an XML file named
content.opf.  Subject tags sit inside the <metadata> element and look
like this:

    <metadata>
      <dc:subject>Presidents -- United States -- Biography</dc:subject>

The code is written to run under both Python 2 and Python 3, which is
why every print() call takes exactly one pre-formatted string.
"""

import os
import sys
import xml.dom.minidom
import zipfile

_SUBJECT_TAG = 'dc:subject'
_OPF_BASENAME = 'content.opf'


def _find_content_opf(zf):
    """Return the archive name of the first content.opf entry, or None."""
    for name in zf.namelist():
        if os.path.basename(name) == _OPF_BASENAME:
            return name
    return None


def _node_text(node):
    """Return the text directly inside *node* ('' for an empty element)."""
    return ''.join(child.data for child in node.childNodes
                   if child.nodeType in (child.TEXT_NODE,
                                         child.CDATA_SECTION_NODE))


def _parse_opf(zf, opf_name, filename):
    """Parse archive member *opf_name* of *zf* and return the minidom DOM.

    An IOError raised while reading is re-raised with *filename*
    prepended to its message.  The member handle is always closed.
    """
    content = zf.open(opf_name)
    try:
        try:
            return xml.dom.minidom.parse(content)
        except IOError as e:
            raise IOError(filename + ': ' + str(e))
    finally:
        content.close()


def tag_epub_file(filename, new_tag_list=None, delete_tags=False):
    """Print, and optionally rewrite, the subject tags of one EPUB file.

    Every existing dc:subject is printed.  If *delete_tags* is true the
    existing tags are removed; each string in *new_tag_list* is then
    added as a new dc:subject, in the order given.  When anything was
    changed, the original file is renamed to ``filename + ".bak"`` and
    the modified archive takes its place.

    Parameters:
        filename: path of the EPUB file.
        new_tag_list: iterable of tag strings to add, or None.
        delete_tags: remove all existing dc:subject elements first.

    Returns:
        None.  If *filename* is not a zip file a message is printed and
        nothing else happens.

    Raises:
        RuntimeError: the archive has no content.opf, or tags must be
            added but the OPF has neither a dc:subject nor a <metadata>
            element to attach them to.
        IOError: reading content.opf failed.
    """
    print(filename)
    if not zipfile.is_zipfile(filename):
        print("%s isn't a zip file" % filename)
        return

    zf = zipfile.ZipFile(filename)
    try:
        opf_name = _find_content_opf(zf)
        if opf_name is None:
            raise RuntimeError('No content.opf in %s' % filename)
        dom = _parse_opf(zf, opf_name, filename)

        # Walk the existing tags, remembering where the last one sits so
        # that new tags can be placed directly after it.
        parent = None
        insert_before = None    # None means "append at end of parent"
        removed_any = False
        for tag in dom.getElementsByTagName(_SUBJECT_TAG):
            parent = tag.parentNode
            # Captured before any removal: removeChild() clears the
            # removed node's sibling links.
            insert_before = tag.nextSibling
            if delete_tags:
                print("Deleting: %s" % _node_text(tag))
                parent.removeChild(tag)
                removed_any = True
            else:
                print("   %s" % _node_text(tag))

        # Nothing to add and nothing removed: leave the file untouched.
        if not new_tag_list and not removed_any:
            return

        if new_tag_list:
            # If we didn't see a dc:subject, we still need a parent:
            # the <metadata> element.
            if parent is None:
                metadata = dom.getElementsByTagName("metadata")
                if not metadata:
                    raise RuntimeError('No metadata element in %s'
                                       % filename)
                parent = metadata[0]

            for new_tag in new_tag_list:
                newnode = dom.createElement(_SUBJECT_TAG)
                newnode.appendChild(dom.createTextNode(new_tag))
                # insertBefore(node, None) appends, so one code path
                # covers both "after the last tag" and "at the end".
                # Inserting before a fixed reference node keeps the new
                # tags in the order they were given.
                parent.insertBefore(newnode, insert_before)
                # Also add a newline after each new node.
                parent.insertBefore(dom.createTextNode('\n'), insert_before)
                print("Adding: %s" % new_tag)

        # Write a new archive next to the original, copying everything
        # except the content.opf we parsed, which is replaced.
        directory, base = os.path.split(filename)
        new_epub_file = os.path.join(directory, 'tmp-' + base)
        ozf = zipfile.ZipFile(new_epub_file, 'w')
        try:
            for info in zf.infolist():
                if info.filename == opf_name:
                    # The document is always re-serialized as UTF-8.
                    # toxml() is used rather than toprettyxml() because
                    # the latter's default indent is added to existing
                    # whitespace text nodes, so the file would gain
                    # padding on every rewrite.
                    data = dom.toxml(encoding='UTF-8')
                else:
                    data = zf.read(info.filename)
                # Passing the ZipInfo keeps each entry's name, order and
                # compression type.
                ozf.writestr(info, data)
        finally:
            ozf.close()
    finally:
        zf.close()

    # Now we have the new file in new_epub_file, old in filename.
    # Rename appropriately:
    os.rename(filename, filename + ".bak")
    os.rename(new_epub_file, filename)
    print("Wrote %s" % filename)


def _print_usage():
    """Print the command-line usage summary."""
    prog = os.path.basename(sys.argv[0])
    print("Usage: %s file.epub [file.epub...] [-d] [-t tag1 [tag2...]]"
          % prog)
    print("")
    print("-d: delete existing tags before adding new ones")
    print("Without -t, print existing tags for all given epub files")


def _main(argv):
    """Process command-line arguments *argv*; return the exit status.

    Arguments before -t are EPUB files to read or modify (unreadable
    ones are skipped with a message); every argument after -t is a tag
    to add; -d anywhere means existing tags are deleted first; any other
    flag prints the usage statement and stops.
    """
    epubfiles = []
    tags = []
    add_tags = False
    delete_tags = False

    for arg in argv:
        if arg == '-d':
            delete_tags = True
        elif arg == '-t':
            add_tags = True
        elif arg.startswith('-'):
            _print_usage()
            return 1
        elif add_tags:
            # Done adding files, adding tags now.
            tags.append(arg)
        elif os.access(arg, os.R_OK):
            epubfiles.append(arg)
        else:
            print("Can't read %s -- skipping" % arg)

    for f in epubfiles:
        print("=======")
        tag_epub_file(f, tags, delete_tags)
    return 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv[1:]))