1. lxml是个很不错的库,除了elementtree风格的XML解析之外,还支持html的解析,尤其是支持按照css选择器来查找节点。详细请参看lxml: an underappreciated web scraping library这篇文章以及lxml.html的文档(可惜文档不是太详细,得自己多试验)
2. 1742, 1709, 1705等章节取不出来(数量占19/196),第19行cssselect返回为空,具体原因还没看出来,明天再查吧
====== 8< ===================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8
from __future__ import with_statement
import sys
import os.path
import urllib2
import lxml.html
def parse_article(url, id):
    """Fetch one blog article and save its text content to '<id>.txt'.

    Skips articles that have already been downloaded.  Known issue (see
    the notes above the script): some pages (~19/196, e.g. 1742, 1709,
    1705) have no 'div.articleContent' node; raise a clear error for
    those instead of an opaque IndexError.
    """
    if os.path.exists("%s.txt" % id):
        return  # already downloaded, don't refetch
    r = urllib2.urlopen(url)
    try:
        root = lxml.html.parse(r).getroot()
    finally:
        r.close()  # don't leak the HTTP connection
    nodes = root.cssselect("div.articleContent")
    if not nodes:
        # fail loudly with the URL so the failing pages are easy to inspect
        raise ValueError("no div.articleContent found in %s" % url)
    content = nodes[0].text_content().encode('utf-8')
    print >>sys.stderr, id,
    # 'with' closes the file on exit; the original's explicit close()
    # inside the with-block was redundant.  open() replaces the
    # deprecated file() builtin.
    with open("%s.txt" % id, 'w') as f:
        f.write(content.replace("\n\n", "\n"))
        f.write("========================")
booktitle="明朝那些事儿-历史应该可以写得好看"
def main():
    """Walk the article-list pages and download every chapter of the book.

    Scrapes each listing page, picks links whose title contains
    `booktitle`, extracts the 4-digit chapter id from the title's tail,
    and hands each article URL to parse_article().
    """
    for i in range(1, 5):  # listing pages for part 7 only
        r = urllib2.urlopen("http://blog.sina.com.cn/s/articlelist_1233526741_4_%i.html" % i)
        try:
            root = lxml.html.parse(r).getroot()
        finally:
            r.close()  # don't leak the HTTP connection
        for link in root.cssselect("div.articleTitle"):
            a = link.find('a')
            # e.g. "(长篇)明朝那些事儿-历史应该可以写得好看[1752]"
            title = a.text.encode('utf-8')
            url = a.get('href')
            if title.find(booktitle) >= 0:
                id = title[-5:-1]  # 4-digit chapter number, e.g. "1752"
                try:
                    parse_article(url, id)
                except Exception:
                    # was a bare 'except:', which also swallowed
                    # KeyboardInterrupt/SystemExit; best-effort per
                    # article is intentional, so log and continue.
                    print >>sys.stderr, "%s (Error occurred %s)" % (id, url)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
没有评论:
发表评论