当前位置 博文首页 > LY的博客:代理IP 多线程 伪造表头 爬虫小框架

    LY的博客:代理IP 多线程 伪造表头 爬虫小框架

    作者:[db:作者] 时间:2021-08-02 12:44

    翻到一个两年前写的爬虫小框架:随机代理 IP + 多线程 + 伪造请求头,代码如下。

    # coding=utf-8
    
    import tushare as ts
    import pandas as pd
    import requests
    import json
    import re
    import time
    from retrying import retry
    from concurrent.futures import ThreadPoolExecutor
    import random
    
    def get_pro():
        """Return the pool of HTTP proxy addresses as ``'host:port'`` strings.

        A new list object is built on every call, so callers may mutate the
        result without affecting later calls.
        """
        # Named ``proxies`` rather than ``list`` so the builtin is not shadowed.
        proxies = [
            '122.114.31.177:808',
            '61.135.217.7:80',
            '113.121.243.109:808',
            '171.39.40.5:8123',
            '121.31.199.30:8123',
            '111.155.116.240:8123',
            '125.121.121.171:808',
            '115.213.178.192:808',
        ]

        return proxies
    
    
    # Start of the elapsed-time measurement. time.clock() was removed in
    # Python 3.8; time.perf_counter() is its documented replacement.
    start = time.perf_counter()  # timing - start

    urlnum = range(8)
    # Ids of the tasks that still have to be fetched. This must be a real list
    # (not the range object itself) because crawl() removes finished ids from it.
    listdo = list(urlnum)

    # Seconds to wait for a proxy before treating the attempt as failed; without
    # a timeout a dead proxy can block a worker thread indefinitely.
    REQUEST_TIMEOUT = 10


    # Request errors are handled inside crawl(), so @retry only re-invokes it
    # when an unexpected exception escapes (at most 8 attempts in total).
    @retry(stop_max_attempt_number=8)
    def crawl(n):
        """Fetch http://httpbin.org/ip through a randomly chosen proxy.

        Args:
            n: id of the task being processed.

        Returns:
            The response body as text on success, or ``None`` when the request
            failed.

        Side effects:
            On success ``n`` is appended to ``listye`` and removed from
            ``listdo``; on failure ``n`` is appended to ``listno``.
        """
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}

        # random.choice picks a valid element; random.randint(0, len(...)) has
        # an inclusive upper bound and could index one past the end.
        proxies_l = {'http': random.choice(get_pro())}
        print(proxies_l['http'])

        try:
            req = requests.get('http://httpbin.org/ip', headers=header,
                               proxies=proxies_l, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Only network/proxy failures are treated as "try again next
            # round"; programming errors are allowed to propagate.
            print('no proxies')
            listno.append(n)
            return None

        print('finish')
        listye.append(n)
        listdo.remove(n)
        print(listdo)

        return req.text


    def multithreading():
        """Run crawl() for every pending task id on a thread pool.

        Returns:
            The module-level ``event`` list, extended with one crawl() result
            per task id that was pending when the round started.
        """
        # Snapshot the queue: workers remove ids from listdo while the executor
        # is still consuming its input, which would otherwise skip tasks.
        pending = list(listdo)

        with ThreadPoolExecutor(max_workers=10) as executor:
            event.extend(executor.map(crawl, pending))

        return event


    # Keep running rounds until every task id has been fetched successfully.
    # NOTE: if no proxy ever works, this loop does not terminate.
    while True:
        listye = []  # ids that succeeded in this round
        listno = []  # ids that failed in this round
        event = []   # crawl() results collected in this round

        event = multithreading()
        print('listye')
        print(listye)
        print('listno')
        print(listno)
        print('listdo')
        print(listdo)

        if not listdo:
            break

    end = time.perf_counter()  # timing - end
    print("爬取完成 用时:")
    print(end - start)

    ?

    cs