Skynet

---------- ---------- 我的新 blog : liukaiyi.cublog.cn ---------- ----------

  BlogJava :: 首页 :: 联系 :: 聚合  :: 管理
  112 Posts :: 1 Stories :: 49 Comments :: 0 Trackbacks



#关键字
   python,pycurl, SGMLParser ,sqlite3, 抓取 , 增量计算,encodeuri 转换,crontab ,mail
#参考
  sqlite3  http://linuxgazette.net/109/chirico1.html
  SGMLParser http://www.woodpecker.org.cn/diveintopython/html_processing/index.html




提取 python
import pycurl
from sgmllib import SGMLParser
import re
from urllib import quote, unquote 

# Subclass SGMLParser (HTML parsing). For details see:
#    http://www.woodpecker.org.cn/diveintopython/html_processing/index.html
class BaiduTop_GMLParser(SGMLParser):
    """Fetch the Baidu MP3 top-song page and collect (actor, music) pairs.

    Constructing an instance downloads the page with pycurl and feeds the
    response body to the SGML parser. Every <a> tag whose href contains
    ``word=<music>+<actor>`` contributes one ``(actor, music)`` tuple, which
    getData() returns afterwards.
    """

    # Page downloaded when the constructor is called without a url.
    DEFAULT_URL = 'http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2'

    # Same pattern as before, compiled once: group 1 is the song title,
    # group 2 is the performer.
    _WORD_RE = re.compile(r'word=(.*)\+(.*)')

    def __init__(self, url=DEFAULT_URL):
        """Download ``url`` and parse it immediately.

        url -- page to fetch; defaults to the Baidu top-song list.
        """
        # The base initializer stores the verbose flag and calls
        # self.reset(), so the parser state and self.data are both set up.
        SGMLParser.__init__(self)
        self.feed(self._fetch(url))

    def reset(self, verbose=0):
        """Reset parser state and drop everything collected so far."""
        SGMLParser.reset(self)
        self.data = []
        self.a = None

    def start_a(self, attrs):
        """Handle an opening <a> tag; attrs is a list of (name, value) pairs."""
        hrefs = [v for k, v in attrs if k == 'href']
        if not hrefs:
            # Anchors without an href used to raise IndexError here.
            return
        found = self._WORD_RE.search(hrefs[0])
        if not found:
            return
        # The Baidu page is GBK-encoded and the words are URI-escaped;
        # unescape them and convert to UTF-8.
        music = unquote(found.group(1)).decode('gbk').encode('utf8')
        actors = unquote(found.group(2)).decode('gbk').encode('utf8')
        self.data.append((actors, music))
        self.a = True

    def getData(self):
        """Return the list of (actor, music) tuples collected so far."""
        return self.data

    def _fetch(self, url):
        """Return the body of ``url``, following up to 5 redirects."""
        import StringIO  # Python 2 module; kept as a local import
        body = StringIO.StringIO()
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.WRITEFUNCTION, body.write)
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            # A proxy can be configured here with pycurl.PROXY and
            # pycurl.PROXYUSERPWD if one is required.
            curl.perform()
        finally:
            # Release the handle even when the transfer fails.
            curl.close()
        return body.getvalue()



使用 py
#!python      
# -*- coding: UTF8 -*-
'''
新添加入 top  
当天全量
没有 歌手名

退出 top

drop table baidu_Top ;
create table baidu_Top (
  id integer auto_increment  PRIMARY KEY ,
  actor varchar(300) ,
  music varchar(300) ,
  createTime DATE
);

'''


import sqlite3,os,sys
import datetime, calendar  
import pdb

from baiduTop500 import BaiduTop_GMLParser
class Action():
    """Store today's chart in the baidu_Top table and classify its entries.

    conn -- an open sqlite3 connection whose database has a baidu_Top table
            with actor, music and createTime columns.
    data -- iterable of (actor, music) pairs.

    After insertAll() the following lists are filled:
      allData          pairs inserted by this run (no row existed for today)
      allNotActorData  the subset of allData whose actor is "" or " "
      newData          pairs that have no row dated yesterday
      newNotActorData  the subset of newData whose actor is "" or " "
    """

    def __init__(self, conn, data):
        self.conn = conn
        self.data = data
        self.allData = []
        self.newData = []
        self.newNotActorData = []
        self.allNotActorData = []

    def insertAll(self):
        """Insert today's missing rows and fill the result lists.

        A failure on one pair is printed (exception type, then message) and
        processing continues with the next pair. The cursor is always closed.
        """
        # createTime is bound from Python's local date. SQLite's date() is
        # UTC, which disagreed with the local dates used by the lookups.
        insertSql = "insert into baidu_Top (actor,music,createTime) values (?,?,?) ;"
        isSql = "select music from baidu_Top where actor=? and music=? and createTime=strftime('%Y-%m-%d',?) ;"

        today = datetime.date.today()
        todayKey = today.isoformat()
        yesterdayKey = (today - datetime.timedelta(days=1)).isoformat()

        cur = self.conn.cursor()
        try:
            for actor, music in self.data:
                pair = (actor, music)
                noActor = actor in ("", " ")
                try:
                    # Insert only when this pair has no row for today yet.
                    cur.execute(isSql, (actor, music, todayKey))
                    if not cur.fetchall():
                        cur.execute(insertSql, (actor, music, todayKey))
                        # Commit on the connection this object was given,
                        # not on a module-level global.
                        self.conn.commit()
                        self.allData.append(pair)
                        if noActor:
                            self.allNotActorData.append(pair)

                    # A pair is "new" when it was not stored yesterday.
                    cur.execute(isSql, (actor, music, yesterdayKey))
                    if not cur.fetchall():
                        self.newData.append(pair)
                        if noActor:
                            self.newNotActorData.append(pair)
                except Exception as myError:
                    # Best effort per row: report and carry on.
                    print(type(myError))
                    print(myError)
        finally:
            cur.close()
            
    

def pfor(title, data):
    """Print one tab-separated "title, actor, music" line per pair in data.

    title -- label repeated at the start of every line.
    data  -- iterable of (actor, music) pairs; nothing is printed when empty.
    """
    for a, m in data:
        # A single parenthesised argument prints identically on Python 2 and 3.
        print("%s\t%s\t%s" % (title, a, m))
def line():
    """Print a separator: a blank line, two underscore rules, a blank line."""
    rule = "___________________________________________________________________________________"
    # print("") emits the same empty line as a bare Python 2 `print`, and
    # also works on Python 3, where a bare `print` would print nothing.
    print("")
    print(rule)
    print(rule)
    print("")

if __name__ == "__main__":
    # Connect before entering the try block: if the connection fails there
    # is nothing to close, and the real error is not masked by a NameError
    # on `conn` in the finally clause.
    conn = sqlite3.connect("/home/xj_liukaiyi/src/python/baidu_top/ex500")
    try:
        # Scrape the chart, store it, and work out what is new.
        ac = Action(conn, BaiduTop_GMLParser().getData())
        ac.insertAll()

        # Legend for the report sections printed below.
        print('''说明 %s : 
            new 对比前一天新添加
            new not actor 对比前一天新增加但没歌手名
            all 当天top 500 展现全部 
            all not actor 当天 top 500 展现全部全但没歌手 
''' % (datetime.date.today()))

        # Each section is preceded by a separator.
        sections = (
            ("new", ac.newData),
            ("new not actor", ac.newNotActorData),
            ("all", ac.allData),
            ("all not actor", ac.allNotActorData),
        )
        for title, rows in sections:
            line()
            pfor(title, rows)
    finally:
        conn.close()



再通过系统
crontab -e


输出先从 UTF-8 转码为 GBK，然后通过邮件发送；每天早上 5 点执行。
0 5 * * * /usr/local/bin/python /home/xj_liukaiyi/src/python/baidu_top/Action.py|perl -MEncode -ne 'print encode("GBK", decode("UTF-8",$_));' > tmp ; mail -s "baidu Top 500"  liukaiyi@gmail.com  < tmp;



整理 www.blogjava.net/Good-Game
posted on 2009-05-06 09:47 刘凯毅 阅读(1522) 评论(0)  编辑  收藏 所属分类: python

只有注册用户登录后才能发表评论。


网站导航: