2004年11月18日星期四

How to convert Friends.doc into a CHM file (1)

Part1: Friends.doc -> friends.htm -> friends-thin.htm

1. Open Friends.doc with MS Word and save it as an HTML file (friends.htm). I used Office XP,
and due to the garbage info added by MS Word, the output file is about 28 MB!

2. Write a simple script (friends-diet.py) to get rid of the garbage attributes
generated by evil MS Word, including 'class', 'style', 'lang', etc.
This cuts the file size by about 75%!
#!/bin/python

from sgmllib import SGMLParser
import htmlentitydefs
import os, sys

class FriendsDiet(SGMLParser):
    """Re-emit an MS Word HTML export with the Word-specific clutter removed.

    The parser writes a slimmed-down copy of whatever is fed to it into
    ``friends-thin.html``: ``span`` and Office (``o`` / ``SmartTagType``)
    start tags are dropped, and the ``class``, ``style``, ``lang`` and
    ``xmlns*`` attributes are stripped from every tag that is kept.
    """

    # Attribute names that are never copied to the output.
    _DROPPED_ATTRS = ('class', 'style', 'lang')

    def reset(self):
        """Open (and truncate) the output file, then reset the parser state."""
        # reset() can run more than once on the same instance; close the
        # handle from the previous run instead of leaking it.
        previous = getattr(self, 'output', None)
        if previous is not None:
            previous.close()
        self.output = open("friends-thin.html", "w")
        SGMLParser.reset(self)

    def close(self):
        """Finish parsing and close the output file so buffered data is flushed."""
        SGMLParser.close(self)
        self.output.close()

    def unknown_starttag(self, tag, attrs):
        """Write a start tag without its Word-specific attributes.

        ``tag`` is the tag name and ``attrs`` a sequence of ``(key, value)``
        pairs, as supplied by SGMLParser.
        """
        if tag == 'p':
            # NOTE(review): this literal was lost in the published listing
            # (the markup was swallowed and the backslash stripped);
            # "<p>\n" is reconstructed from context -- paragraphs are
            # emitted bare and their end tags are skipped below.
            self.output.write("<p>\n")
        elif tag == 'span' or tag == 'o':
            pass
        elif tag == 'o:SmartTagType' or tag == 'SmartTagType':
            # Same output as the original ``print "Ignore", tag`` statement,
            # spelled so that it does not depend on the print statement.
            sys.stdout.write("Ignore %s\n" % tag)
        else:
            kept = [
                ' %s="%s"' % (key, value)
                for key, value in attrs
                if key not in self._DROPPED_ATTRS
                and not key.startswith('xmlns')
            ]
            self.output.write("<%s%s>" % (tag, "".join(kept)))
            if tag == 'body':
                self.output.write('\n')

    def unknown_endtag(self, tag):
        """Write an end tag unless it is ``span``, ``p`` or starts with ``o:``."""
        if tag != 'span' and tag != 'p' and tag[0:2] != 'o:':
            # The published listing showed ``"n" % tag``, which has no
            # format specifier and would raise TypeError; the closing-tag
            # markup had been swallowed by the page.
            self.output.write("</%s>\n" % tag)

    def handle_data(self, text):
        """Write text content followed by a newline; skip whitespace-only runs."""
        if text.strip() != '':
            self.output.write(text + "\n")

    def handle_charref(self, ref):
        """Pass a numeric character reference through unchanged."""
        self.output.write("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Pass an entity reference through, terminating known entities with ';'."""
        semicolon = ""
        if ref in htmlentitydefs.entitydefs:
            semicolon = ";"
        self.output.write("&%s%s" % (ref, semicolon))

if __name__ == '__main__':
    import sys

    # Input file: the first command-line argument when one is given,
    # otherwise the default name used throughout this walkthrough.
    if len(sys.argv) > 1:
        source_path = sys.argv[1]
    else:
        source_path = "friends.htm"

    parser = FriendsDiet()
    fh = open(source_path, "r")
    try:
        # The whole document is read and fed to the parser in one call.
        parser.feed(fh.read())
    finally:
        # Release the input handle even if parsing fails.
        fh.close()
    parser.close()

没有评论: