当前位置 博文首页 > 下载糗事百科的内容_python版

    下载糗事百科的内容_python版

    作者:admin 时间:2021-02-06 09:24

    复制代码 代码如下:

    #coding:utf-8

    import urllib.request
    import xml.dom.minidom
    import sqlite3
    import threading
    import time

    class logger(object):
        """Minimal console logger that prints each message part on its own line."""

        def log(self, *msg):
            """Print every positional argument on a separate line.

            Nothing is printed when called without arguments.
            """
            for part in msg:
                print(part)

    # Shared module-level logger instance; the classes and functions in this
    # file report progress through Log.log(...).
    Log = logger()
    # Smoke-test message printed as soon as the module is loaded.
    Log.log('测试下')

    class downloader(object):
        """Fetch the raw bytes of a single URL."""

        def __init__(self, url, timeout=30):
            """Remember the target URL.

            url: address to fetch.
            timeout: seconds to wait on the network before giving up, so a
                stalled server cannot block the crawler forever.
            """
            self.url = url
            self.timeout = timeout

        def download(self):
            """Return the response body as bytes, or None when the fetch fails."""
            Log.log('开始下载', self.url)
            try:
                # The context manager closes the HTTP response even if read() fails.
                with urllib.request.urlopen(self.url, timeout=self.timeout) as response:
                    content = response.read()
            except Exception as exc:
                # Best-effort download: report the failure and let the caller
                # skip this page. KeyboardInterrupt/SystemExit still propagate.
                Log.log('下载出错', exc)
                return None
            Log.log('下载完毕')
            return content


    class parser(object):
        """Extract the post text and the neighbouring post IDs from a page."""

        def __init__(self, content):
            """Parse the page markup into a DOM tree.

            content: page markup (str or bytes). It must be well-formed XML,
                otherwise xml.dom.minidom.parseString raises.
            """
            # Root of the parsed document.
            self.html = xml.dom.minidom.parseString(content)

        def parse(self):
            """Return {'content': <post text>, 'url': [<linked post ids>]}.

            'content' stays '' when no <div class="content"> starting with a
            text node is found; 'url' lists the IDs taken from <a> elements
            that directly wrap a <span> (the previous/next post links).
            """
            Log.log('开始提取数据')
            contents = {'content': '', 'url': []}

            # The post text is the leading text node of <div class="content">.
            # getAttribute returns '' for a missing attribute, so no separate
            # hasAttribute check is needed.
            for div in self.html.getElementsByTagName('div'):
                if div.getAttribute('class') != 'content':
                    continue
                first_child = div.firstChild
                # Skip empty divs and divs whose first child carries no text
                # instead of raising IndexError/AttributeError.
                text = getattr(first_child, 'data', None)
                if text is not None:
                    contents['content'] = text

            # Previous/next post links are <a href="..."><span>...</span></a>.
            for span in self.html.getElementsByTagName('span'):
                link = span.parentNode
                # The parent may be the Document node, which has no tagName.
                if getattr(link, 'tagName', None) != 'a':
                    continue
                href = link.getAttribute('href')
                # Drop the first 10 and last 4 characters of the href
                # (e.g. '/articles/' and '.htm') to leave the bare ID.
                qid = href[10:-4]
                if qid:
                    contents['url'].append(qid)

            Log.log('提取数据完毕')
            return contents

    def downloadPage(qid, db):
        """Download one post, store its text and queue the posts it links to.

        qid: ID of the post to fetch.
        db: storage object providing updateContent, addQID and updateStatus.

        The post is marked finished (status 2) only when its text was stored
        and both neighbouring IDs (previous and next) were found.
        """
        url = 'http://www.qiushibaike.com/articles/' + str(qid) + '.htm'
        content = downloader(url).download()
        if not content:
            # Download failed or returned nothing; the post stays pending.
            return

        contents = parser(content).parse()
        has_text = bool(contents['content'])
        if has_text:
            db.updateContent(qid, contents['content'])

        # Queue every neighbouring post that was discovered.
        for linked_qid in contents['url']:
            db.addQID(linked_qid)

        if has_text and len(contents['url']) == 2:
            db.updateStatus(qid, 2)

    # Download pool: its size is the number of links fetched at the same time.
    class downloaderPool(object):
        """Fixed-size pool of worker threads that download queued post IDs."""

        def __init__(self, maxLength=15):
            """Create an empty pool with maxLength worker slots."""
            # One slot per worker; None marks a free slot.
            self.downloaders = [None] * maxLength
            self.downloadList = []
            self.db = None

        def setDownloadList(self, downloadList):
            """Merge downloadList into the pending queue, dropping duplicates."""
            self.downloadList = list(set(self.downloadList + downloadList))

        def setdb(self, db):
            """Set the storage object handed to every download worker."""
            self.db = db

        def daemon(self):
            """Run one scheduling pass over the pool.

            Frees the slots of finished workers, starts a worker in every free
            slot while IDs are pending, waits for the workers started in this
            pass, then sleeps one second.
            """
            Log.log('设置守护进程')
            # Release the slots whose thread has finished.
            for index, worker in enumerate(self.downloaders):
                if worker is not None and not worker.is_alive():
                    Log.log('将下载器置空', index)
                    self.downloaders[index] = None

            # Fill every free slot with a new worker thread.
            started = []
            for index, worker in enumerate(self.downloaders):
                if worker is not None:
                    continue
                qid = self.getQID()
                if not qid:
                    continue
                thread = threading.Thread(target=downloadPage, args=(qid, self.db))
                self.downloaders[index] = thread
                thread.start()
                started.append(thread)
                Log.log('设置下载器', index)

            # Join only after every worker has been started, so the downloads
            # overlap instead of running one after another.
            for thread in started:
                thread.join()

            # Pause one second between passes.
            time.sleep(1)

        def getQID(self):
            """Pop and return the next pending ID, or None when the queue is empty."""
            if not self.downloadList:
                return None
            return self.downloadList.pop(0)

        def beginDownload(self):
            """Run one scheduling pass in a daemon thread and wait for it to end."""
            supervisor = threading.Thread(target=self.daemon)
            supervisor.daemon = True
            supervisor.start()
            supervisor.join()

        def getDownloader(self):
            """Return the index of the first free slot, or None if all are busy."""
            for index, worker in enumerate(self.downloaders):
                if not worker:
                    return index
            return None


    # SQL statements for the qiushibaike table; every value is bound through
    # '?' placeholders.
    # Insert a new row holding an id and its success flag.
    ADD_Q_ID = 'insert into qiushibaike(id,success) values(?,?)'
    # Store the content of the row with the given id.
    UPDATE_Q_CONTENT = 'update qiushibaike set content=? where id=?'
    # Change the success flag of the row with the given id.
    UPDATE_Q_STATUS = 'update qiushibaike set success=? where id=?'
    # List the ids of all rows carrying the given success flag.
    Q_LIST = 'select id from qiushibaike where success=?'
    # Count the rows that have the given id.
    Q_LIST_BY_ID = 'select count(*) from qiushibaike where id=?'
    class dbConnect(object):
        """SQLite-backed store for downloaded posts.

        Expected table:

            create table qiushibaike(
                id      Integer,
                content Varchar,
                success Integer
            )

        id      -- ID of the post
        content -- text of the post
        success -- download state; a post is finished once its text has been
                   stored and the IDs of the previous and next posts are known
                   1 = not finished
                   2 = finished
        """

        def __init__(self, dbpath='db.sqlite'):
            """Remember the database file; a connection is opened per call."""
            self.dbpath = dbpath

        def _run(self, sql, params, fetch=False):
            """Execute one statement on a fresh connection.

            Returns the fetched rows when fetch is true; otherwise commits and
            returns None. Cursor and connection are closed even on errors.
            """
            cn = sqlite3.connect(self.dbpath)
            try:
                c = cn.cursor()
                try:
                    c.execute(sql, params)
                    if fetch:
                        return c.fetchall()
                    cn.commit()
                    return None
                finally:
                    c.close()
            finally:
                cn.close()

        def addQID(self, qid):
            """Register qid as a pending post unless it is already stored.

            Database errors are logged and swallowed, so a failed insert never
            aborts the crawl.
            """
            Log.log('插入糗事百科', qid)
            try:
                # Without this check a table lacking a unique key would collect
                # a new pending row every time a post is linked again.
                already_known = self._run(Q_LIST_BY_ID, (qid,), fetch=True)[0][0]
                if already_known:
                    Log.log('ID已存在,跳过', qid)
                    return
                self._run(ADD_Q_ID, (qid, 1))
            except sqlite3.Error:
                Log.log('添加ID出错', qid)
                return
            Log.log('插入成功')

        def updateContent(self, qid, content):
            """Store the text of post qid."""
            Log.log('更新糗事百科', qid, content)
            self._run(UPDATE_Q_CONTENT, (content, qid))
            Log.log('更新成功')

        def updateStatus(self, qid, flag):
            """Set the success flag of post qid (1 = not finished, 2 = finished)."""
            Log.log('更新状态', qid, flag)
            self._run(UPDATE_Q_STATUS, (flag, qid))
            Log.log('更新状态成功')

        def getList(self, unDonloaded=1):
            """Return the IDs of all posts whose success flag equals unDonloaded."""
            Log.log('获得列表')
            rows = self._run(Q_LIST, (unDonloaded,), fetch=True)
            ids = [row[0] for row in rows]
            Log.log('获得列表成功')
            return ids

    class singleDownloader(object):
        """Sequential downloader exposing the same methods as downloaderPool."""

        def __init__(self):
            """Start with an empty queue and no storage object."""
            self.downloadList = []
            self.db = None

        def setdb(self, db):
            """Set the storage object passed to downloadPage."""
            self.db = db

        def setDownloadList(self, downloadList):
            """Merge downloadList into the pending queue, dropping duplicates."""
            self.downloadList = list(set(self.downloadList + downloadList))

        def beginDownload(self):
            """Download every queued post one after another.

            Each ID is removed from the queue before it is processed, so a
            later call never downloads the same ID again unless it was
            re-queued through setDownloadList.
            """
            while self.downloadList:
                downloadPage(self.downloadList.pop(0), self.db)

    def main():
        """Download posts until the database lists no pending post.

        NOTE(review): a post that can never be completed keeps its pending
        flag, so this loop does not terminate on its own in that case.
        """
        db = dbConnect('db.sqlite')
        # singleDownloader and downloaderPool expose the same methods
        # (setdb, setDownloadList, beginDownload), so either can be used here.
        dp = singleDownloader()
        dp.setdb(db)

        unDownloadedList = db.getList()
        # Keep going while there are posts that have not been downloaded.
        while unDownloadedList:
            # Hand the pending IDs to the downloader and process them.
            dp.setDownloadList(unDownloadedList)
            dp.beginDownload()

            time.sleep(1)
            # Refresh the pending list; downloads may have queued new IDs.
            unDownloadedList = db.getList()

    # Run the crawler only when the file is executed as a script.
    if __name__ == '__main__':
        main()


    代码是没问题的,可以正常运行,但是希望做到以下2方面:
    1、多线程下载
    2、代码分离度更高,更面向对象
    下一篇:没有了