当前位置 博文首页 > Python实现从订阅源下载图片的方法

    Python实现从订阅源下载图片的方法

    作者:saintatgod 时间:2021-07-25 18:41

    本文实例讲述了Python实现从订阅源下载图片的方法。分享给大家供大家参考。具体如下:

    这段代码是基于 Python 3.4 实现的,和 Python 2.x 相比有不少差别。
    这是一个练习,数据源来自网易订阅。代码如下:

    复制代码 代码如下:
    __author__ = 'Saint'
    import json
    import os
    import urllib.parse
    import urllib.request
    from html.parser import HTMLParser
    class MyHtmlParser(HTMLParser):
        """Collect the ``src`` value of every ``<img>`` tag fed to the parser.

        After ``feed()`` the collected values are available, in document
        order, in the instance attribute ``links``.
        """

        def __init__(self, *args, **kwargs):
            """Create a parser with its own, initially empty, ``links`` list.

            All arguments are forwarded unchanged to ``HTMLParser``.
            """
            super().__init__(*args, **kwargs)
            # Must be per instance: a class-level list would be shared by
            # every parser, so each new page would also "contain" the images
            # of all pages parsed before it.
            self.links = []

        def handle_starttag(self, tag, attrs):
            """Record the ``src`` attribute of an ``img`` start tag.

            ``tag`` is the lower-cased tag name and ``attrs`` a list of
            ``(name, value)`` pairs, as supplied by ``HTMLParser``.
            """
            if tag != "img":
                return
            for name, value in attrs:
                # A bare ``src`` attribute has the value None and an empty
                # one has ""; neither is a usable download address.
                if name == "src" and value:
                    self.links.append(value)
    class Down(object):
        # 总的目录
        img_path = "E:/saint"
        # 下载目录
        dir = ''
        # 采集源地址
        collect_links = ["http://dy.163.com/v2/media/articlelist/T1374483113516-1", "http://dy.163.com/v2/media/articlelist/T1420776257254-1", "http://dy.163.com/v2/media/articlelist/T1376641060407-1"]
        img_links = "http://dy.163.com/v2/article"
        def handleCollect(self):
            for collect_link in self.collect_links:
                notice = "开始从[" + collect_link + "]采集图片"
                print(notice)
                # 建立下载的目录
                dir_name = collect_link.split("/")[-1]
                self.isDirExists(dir_name)
                dict = self.getListFromSubscribe(collect_link)
                if dict == False:
                    print("数据采集失败,是否继续(y/n)")
                    op = input();
                    if op == "y":
                        os.system("cls")
                        pass
                    elif op == "n":
                        print("停止采集")
                        break
                    else:
                        os.system("cls")
                        print("非法输入")
                        break
                else:
                    for page in dict:
                        page_uri = self.img_links + "/" + page["tid"] + "/" + page["docid"]
                        self.getImgFromUri(page_uri)
                        print("是否继续(y/n)")
                        new_op = input();
                        if new_op == "n":
                            os.system("cls")
                            print("采集完毕")
                            break
            print("OK")
        # 从订阅源获取目录
        def getListFromSubscribe(self, uri):
            res = urllib.request.urlopen(uri)
            if res.code < 200 or res.code > 300:
                os.system("clear")
                return False
            else:
                result = res.read().decode("gbk") # 3.4版本的read()返回的是byte类型,需要decode()处理,选项是网页编码
                dict = json.loads(result)
                if dict['code'] != 1:
                    print(dict['msg'])
                    return False
                else:
                    return dict['data']
        # 获取本期订阅的网页,并从网页中提取出来需要的图片
        def getImgFromUri(self, uri):
            html_code = urllib.request.urlopen(uri).read().decode("gbk")
            hp = MyHtmlParser()
            hp.feed(html_code)
            hp.close()
     
            for link in hp.links: # hp.links 是图片的下载地址的列表
                self.writeToDisk(link)
        # 检查文件目录是否存在,如果不存在,则创建目录
        def isDirExists(self, dir_name):
            self.dir = self.img_path + dir_name
            isExists = os.path.exists(self.dir)
            if not isExists:
                os.makedirs(self.dir)
                return True
            else:
                return True
        # 下载文件,并且写入磁盘
        def writeToDisk(self, url):
            os.chdir(self.dir)
            file = urllib.request.urlopen(url).read()
            file_name = url.split("/")[-1]
            open(file_name, "wb").write(file)
            return True
    # Script entry point: run the interactive collection over all feeds.
    if __name__ == "__main__":
        Down().handleCollect()

    希望本文所述对大家的Python程序设计有所帮助。

    jsjbwy
    下一篇:没有了