Part 1: Friends.doc -> friends.htm -> friends-thin.html
1. Open Friends.doc with MS Word and save it as an HTML file (friends.htm). I used Office XP,
and due to the garbage info added by MS Word, the output file is about 28 MB!
2. Write a simple script (friends-diet.py) to get rid of the garbage attributes
generated by evil MS Word, including 'class', 'style' and 'lang' etc.
This would cut the size by 75%!
#!/bin/python
from sgmllib import SGMLParser
import htmlentitydefs
import os, sys
class FriendsDiet(SGMLParser):
    """Copy an HTML document to ``friends-thin.html`` with less markup.

    Every tag, text run and reference seen by the parser is written to the
    output file, except that:

    * ``<p>`` start tags are replaced by a line break and ``</p>`` is dropped;
    * ``span`` and ``o`` start tags, ``span`` end tags and end tags whose
      name starts with ``o:`` are dropped (their text content is kept);
    * ``class``, ``style``, ``lang`` and ``xmlns*`` attributes are removed
      from all remaining tags;
    * whitespace-only text is dropped.
    """

    # Attribute names that are never copied to the output.
    _DROPPED_ATTRS = ('class', 'style', 'lang')

    def reset(self):
        """Open (truncate) the output file and reset the parser state."""
        # reset() may run more than once on the same instance; close the
        # previous handle so it is not leaked before it is replaced.
        previous = getattr(self, 'output', None)
        if previous is not None:
            previous.close()
        self.output = open("friends-thin.html", "w")
        SGMLParser.reset(self)

    def close(self):
        """Finish parsing, then flush and close the output file."""
        try:
            SGMLParser.close(self)
        finally:
            self.output.close()

    def unknown_starttag(self, tag, attrs):
        """Write start tag *tag* with its filtered *attrs*, or drop it.

        *attrs* is a sequence of ``(name, value)`` pairs.
        """
        if tag == 'p':
            # NOTE(review): this literal was garbled in the original listing
            # (the markup inside it was rendered as a line break); "<br>" is
            # a reconstruction -- confirm against the author's script.
            self.output.write("<br>\n")
        elif tag == 'span' or tag == 'o':
            pass
        elif tag == 'o:SmartTagType' or tag == 'SmartTagType':
            print("Ignore %s" % tag)
        else:
            kept = []
            for key, value in attrs:
                if key in self._DROPPED_ATTRS or key.startswith('xmlns'):
                    continue
                kept.append(' %s="%s"' % (key, value))
            self.output.write("<%s%s>" % (tag, "".join(kept)))
            if tag == 'body':
                self.output.write('\n')

    def unknown_endtag(self, tag):
        """Write the end tag for *tag* unless its element is being dropped."""
        if tag != 'span' and tag != 'p' and not tag.startswith('o:'):
            self.output.write("</%s>\n" % tag)

    def handle_data(self, text):
        """Write *text* followed by a newline, skipping whitespace-only runs."""
        if text.strip() != '':
            self.output.write(text + "\n")

    def handle_charref(self, ref):
        """Write the numeric character reference *ref* back unchanged."""
        self.output.write("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Write entity reference *ref*, terminated only if it is a known name."""
        # Names missing from the entity table are emitted without the ';'.
        semicolon = ""
        if ref in htmlentitydefs.entitydefs:
            semicolon = ";"
        self.output.write("&%s%s" % (ref, semicolon))
if __name__ == '__main__':
    import sys

    # Read the file named by the first command-line argument; with no
    # argument, fall back to the original hard-coded "friends.htm".
    if len(sys.argv) > 1:
        source_path = sys.argv[1]
    else:
        source_path = "friends.htm"

    parser = FriendsDiet()
    fh = open(source_path, "r")
    try:
        content = fh.read()
    finally:
        # Release the input handle even if reading fails.
        fh.close()
    parser.feed(content)
    parser.close()
1. Open Friends.doc with MS Word and save it as an HTML file (friends.htm). I used Office XP,
and due to the garbage info added by MS Word, the output file is about 28 MB!
2. Write a simple script (friends-diet.py) to get rid of the garbage attributes
generated by evil MS Word, including 'class', 'style' and 'lang' etc.
This would cut the size by 75%!
#!/bin/python
from sgmllib import SGMLParser
import htmlentitydefs
import os, sys
class FriendsDiet(SGMLParser):
    """Copy an HTML document to ``friends-thin.html`` with less markup.

    Every tag, text run and reference seen by the parser is written to the
    output file, except that:

    * ``<p>`` start tags are replaced by a line break and ``</p>`` is dropped;
    * ``span`` and ``o`` start tags, ``span`` end tags and end tags whose
      name starts with ``o:`` are dropped (their text content is kept);
    * ``class``, ``style``, ``lang`` and ``xmlns*`` attributes are removed
      from all remaining tags;
    * whitespace-only text is dropped.
    """

    # Attribute names that are never copied to the output.
    _DROPPED_ATTRS = ('class', 'style', 'lang')

    def reset(self):
        """Open (truncate) the output file and reset the parser state."""
        # reset() may run more than once on the same instance; close the
        # previous handle so it is not leaked before it is replaced.
        previous = getattr(self, 'output', None)
        if previous is not None:
            previous.close()
        self.output = open("friends-thin.html", "w")
        SGMLParser.reset(self)

    def close(self):
        """Finish parsing, then flush and close the output file."""
        try:
            SGMLParser.close(self)
        finally:
            self.output.close()

    def unknown_starttag(self, tag, attrs):
        """Write start tag *tag* with its filtered *attrs*, or drop it.

        *attrs* is a sequence of ``(name, value)`` pairs.
        """
        if tag == 'p':
            # NOTE(review): this literal was garbled in the original listing
            # (the markup inside it was rendered as a line break); "<br>" is
            # a reconstruction -- confirm against the author's script.
            self.output.write("<br>\n")
        elif tag == 'span' or tag == 'o':
            pass
        elif tag == 'o:SmartTagType' or tag == 'SmartTagType':
            print("Ignore %s" % tag)
        else:
            kept = []
            for key, value in attrs:
                if key in self._DROPPED_ATTRS or key.startswith('xmlns'):
                    continue
                kept.append(' %s="%s"' % (key, value))
            self.output.write("<%s%s>" % (tag, "".join(kept)))
            if tag == 'body':
                self.output.write('\n')

    def unknown_endtag(self, tag):
        """Write the end tag for *tag* unless its element is being dropped."""
        if tag != 'span' and tag != 'p' and not tag.startswith('o:'):
            self.output.write("</%s>\n" % tag)

    def handle_data(self, text):
        """Write *text* followed by a newline, skipping whitespace-only runs."""
        if text.strip() != '':
            self.output.write(text + "\n")

    def handle_charref(self, ref):
        """Write the numeric character reference *ref* back unchanged."""
        self.output.write("&#%s;" % ref)

    def handle_entityref(self, ref):
        """Write entity reference *ref*, terminated only if it is a known name."""
        # Names missing from the entity table are emitted without the ';'.
        semicolon = ""
        if ref in htmlentitydefs.entitydefs:
            semicolon = ";"
        self.output.write("&%s%s" % (ref, semicolon))
if __name__ == '__main__':
    import sys

    # Read the file named by the first command-line argument; with no
    # argument, fall back to the original hard-coded "friends.htm".
    if len(sys.argv) > 1:
        source_path = sys.argv[1]
    else:
        source_path = "friends.htm"

    parser = FriendsDiet()
    fh = open(source_path, "r")
    try:
        content = fh.read()
    finally:
        # Release the input handle even if reading fails.
        fh.close()
    parser.feed(content)
    parser.close()
没有评论:
发表评论