#关键字
   python,pycurl, SGMLParser ,sqlite3, 抓取 , 增量计算,encodeuri 转换,crontab ,mail 
#参考
  sqlite3  http://linuxgazette.net/109/chirico1.html
  SGMLParser  http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
提取脚本 baiduTop500.py(抓取并解析 baidu top 页面):
import pycurl
from sgmllib import SGMLParser
import re
from urllib import quote, unquote 
# Subclass SGMLParser (HTML parsing).
# For details see
#   http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
class BaiduTop_GMLParser(SGMLParser):
    """Download the Baidu MP3 top-songs page and collect (artist, song) pairs.

    Constructing an instance fetches the chart page with pycurl and feeds it
    to the parser immediately; every anchor whose href matches
    ``word=<song>+<artist>`` contributes one ``(artist, song)`` tuple, which
    can then be read with getData().
    """

    # Chart links carry the song and the artist in the ``word=`` query value,
    # separated by a literal '+'.
    _WORD_RE = re.compile(r'word=(.*)\+(.*)')

    def reset(self, verbose=0):
        """Reset the parser state and clear the collected pairs.

        ``verbose`` is accepted for compatibility with existing callers and
        is not used.
        """
        SGMLParser.reset(self)
        self.data = []
        self.a = None

    def start_a(self, attrs):
        """Handle an opening <a> tag; record it if it is a chart link.

        attrs is the list of (name, value) attribute pairs supplied by
        SGMLParser.
        """
        hrefs = [v for k, v in attrs if k == 'href']
        # Anchors without an href (e.g. <a name="...">) must be skipped
        # before indexing, otherwise hrefs[0] raises IndexError.
        if not hrefs:
            return
        rsc = self._WORD_RE.search(hrefs[0])
        if not rsc:
            return
        # The Baidu page is GBK-encoded and the Chinese text is URI-encoded;
        # convert both captured values to UTF-8.
        music = unquote(rsc.group(1)).decode('gbk').encode('utf8')
        actors = unquote(rsc.group(2)).decode('gbk').encode('utf8')
        self.data.append((actors, music))
        self.a = True

    def getData(self):
        """Return the list of (artist, song) tuples collected so far."""
        return self.data

    def __init__(self):
        """Fetch the chart page and parse it.

        Errors raised by pycurl during the transfer propagate to the caller.
        """
        # Initialise the base parser (which in turn calls reset()) rather
        # than only calling reset(), so base-class state is fully set up.
        SGMLParser.__init__(self)
        import StringIO
        b = StringIO.StringIO()
        c = pycurl.Curl()
        try:
            c.setopt(pycurl.URL, 'http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2')
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            # To go through a proxy, set pycurl.PROXY (and pycurl.PROXYUSERPWD
            # for credentials) here before perform().
            c.perform()
        finally:
            # Release the curl handle even when the transfer fails.
            c.close()
        self.feed(b.getvalue())
使用脚本 Action.py(入库并计算增量):
#!python      
# -*- coding: UTF8 -*-
'''
新添加入 top  
当天全量
没有 歌手名
退出 top
drop table baidu_Top ;
create table baidu_Top (
  id integer PRIMARY KEY AUTOINCREMENT ,
  actor varchar(300) ,
  music varchar(300) ,
  createTime DATE
);
'''
import sqlite3,os,sys
import datetime, calendar  
import pdb
from baiduTop500 import BaiduTop_GMLParser
class Action():
    """Store today's chart in the baidu_Top table and work out what changed.

    conn is an open sqlite3 connection to a database containing baidu_Top;
    data is an iterable of (actor, music) pairs.

    After insertAll() the result lists hold (actor, music) tuples:
      allData          rows inserted for today by this call
      allNotActorData  subset of allData whose actor is "" or " "
      newData          entries that have no row dated yesterday
      newNotActorData  subset of newData whose actor is "" or " "
    """

    def __init__(self,conn,data):
        self.conn = conn
        self.data = data
        self.allData = []
        self.newData = []
        self.newNotActorData=[]
        self.allNotActorData=[]

    def insertAll(self):
        """Insert today's entries that are not stored yet and fill the result lists.

        Each entry is handled on a best-effort basis: if one entry fails, the
        exception type and message are printed and processing continues with
        the next entry.
        """
        today = datetime.date.today()
        # ISO strings are passed explicitly so the stored date and the lookup
        # date come from the same (local) clock.  The previous insert used
        # SQLite's date(), which is UTC and could disagree with the local
        # date used by the lookups.
        todayStr = today.isoformat()
        yesterdayStr = (today - datetime.timedelta(days=1)).isoformat()
        insertSql = "insert into baidu_Top (actor,music,createTime) values (?,?,strftime('%Y-%m-%d',?)) ;"
        isSql = "select music from baidu_Top where actor=? and music=? and createTime=strftime('%Y-%m-%d',?) ;"
        cur = self.conn.cursor()
        try:
            for actor, music in self.data:
                try:
                    # Insert only when today's row for this pair is missing.
                    cur.execute(isSql, (actor, music, todayStr))
                    if not cur.fetchall():
                        cur.execute(insertSql, (actor, music, todayStr))
                        self.allData.append((actor, music))
                        if actor == "" or actor == " ":
                            self.allNotActorData.append((actor, music))
                        # Commit on the connection this object was given,
                        # not on a module-level global.
                        self.conn.commit()
                    # "New" means the pair has no row dated yesterday.
                    cur.execute(isSql, (actor, music, yesterdayStr))
                    if not cur.fetchall():
                        self.newData.append((actor, music))
                        if actor == "" or actor == " ":
                            self.newNotActorData.append((actor, music))
                except Exception as myError:
                    # Deliberate best-effort: report and go on to the next entry.
                    print(sys.exc_info()[0])
                    print(myError)
        finally:
            try:
                cur.close()
            except sqlite3.Error:
                # Closing can fail if the connection is already gone; there
                # is nothing left to clean up in that case.
                pass
            
    
def pfor(title,data):
    """Print one ``title<TAB>first<TAB>second`` line for every pair in data."""
    rowFormat = "%s\t%s\t%s"
    for pair in data:
        actor, music = pair
        print(rowFormat % (title, actor, music))
def line():
    print
    print "___________________________________________________________________________________"
    print "___________________________________________________________________________________"
    print 
# Script entry point: fetch the chart, store it, then print the report that
# the cron job mails out.
if __name__ == "__main__":
    # Connect before entering the try block: if connect() itself fails there
    # is no connection to close, and the real error must not be masked by a
    # NameError raised from the finally clause.
    conn = sqlite3.connect("/home/xj_liukaiyi/src/python/baidu_top/ex500")
    try:
        ac = Action(conn,BaiduTop_GMLParser().getData())
        ac.insertAll()

        print('''说明 %s : 
            new 对比前一天新添加
            new not actor 对比前一天新增加但没歌手名
            all 当天top 500 展现全部 
            all not actor 当天 top 500 展现全部全但没歌手 ''' %(datetime.date.today()))
        line()
        pfor("new",ac.newData)
        line()
        pfor("new not actor",ac.newNotActorData)
        line()
        pfor("all",ac.allData)
        line()
        pfor("all not actor",ac.allNotActorData)
    finally:
        conn.close()
再通过系统 crontab 定时执行:
crontab -e
每天早上 5 点运行,将输出从 UTF-8 转码为 GBK 后通过邮件发送。
0 5 * * * /usr/local/bin/python /home/xj_liukaiyi/src/python/baidu_top/Action.py|perl -MEncode -ne 'print encode("GBK", decode("UTF-8",$_));' > tmp ; mail -s "baidu Top 500"  liukaiyi@gmail.com  < tmp;
整理 www.blogjava.net/Good-Game