当前位置 博文首页 > python实现爬虫下载漫画示例

    python实现爬虫下载漫画示例

    作者:admin 时间:2021-06-22 18:27

    复制代码 代码如下:

    #!/usr/bin/python3.2
    import os,socket
    import urllib
    import urllib.request,threading,time
    import re,sys
    global manhuaweb,weburl,floder,chapterbegin,currentthreadnum,threadcount,mutex,mutex2

    # Runtime settings. Command line:
    #   downloadmanhua weburl floder [chapterbegin] [threadcount]
    # The first two arguments are mandatory, the last two optional.
    currentthreadnum = 0  # counter adjusted by the download functions below
    weburl, floder = '', ''
    chapterbegin, threadcount = 0, 6

    argcount = len(sys.argv)
    if argcount < 3:
        # Not enough arguments: show the usage line and stop.
        print("usag: downloadmanhua weburl floder chapterbegin=0 threadnnum=6")
        sys.exit(0)
    weburl, floder = sys.argv[1], sys.argv[2]
    if argcount >= 4:
        chapterbegin = int(sys.argv[3])
    if argcount >= 5:
        threadcount = int(sys.argv[4])

     

    def jin(i,jinzhi):
        """Return the non-negative integer ``i`` written in base ``jinzhi``.

        Digit values 0-9 are rendered as decimal digits; values of 10 and above
        are rendered as consecutive characters starting at ``'a'``, so bases up
        to 36 use the alphabet ``0-9a-z``.

        Args:
            i: Non-negative integer to convert.
            jinzhi: Target base, at least 2.

        Returns:
            The digits of ``i`` in base ``jinzhi``, most significant first.
            ``0`` is rendered as ``'0'``.

        Raises:
            ValueError: If ``i`` is negative or ``jinzhi`` is smaller than 2.
        """
        if jinzhi < 2:
            raise ValueError("base must be at least 2, got %r" % (jinzhi,))
        if i < 0:
            raise ValueError("cannot convert negative number %r" % (i,))
        digits = []
        while True:
            # divmod keeps the arithmetic in exact integers; dividing with "/"
            # goes through a float and corrupts values above 2**53.
            i, answer = divmod(i, jinzhi)
            if answer > 9:
                digits.append(chr(ord('a') + (answer - 10)))
            else:
                digits.append(str(answer))
            if i == 0:
                break
        # Digits were produced least significant first.
        return ''.join(reversed(digits))
    def urlparse(p,a,c,k):
        """Expand ``p`` by replacing single characters with words from ``k``.

        A lookup table is built for the indices ``c-1`` down to ``0``: the key
        is the index written in base 36 (via ``jin``) and the value is
        ``k[index]``, or the key itself when ``k[index]`` is empty. Every
        character of ``p`` in ``0-9`` or ``a-f`` is then replaced by its table
        entry; all other characters are copied unchanged.

        Args:
            p: Packed text to expand.
            a: Accepted for signature compatibility; not used.
            c: Number of table entries to build.
            k: Sequence of replacement words, indexed by token number.

        Returns:
            The expanded string.

        Raises:
            IndexError: If ``c`` exceeds ``len(k)``.
            KeyError: If ``p`` contains a ``0-9``/``a-f`` character that has no
                table entry.
        """
        table = {}
        while c:
            c -= 1
            word = k[c]
            token = jin(c, 36)
            # An empty word means the token stands for itself.
            table[token] = word if word else token

        pieces = []
        for ch in p:
            code = ord(ch)
            is_hex_letter = ord('a') <= code <= ord('f')
            is_digit = ord('0') <= code <= ord('9')
            if is_hex_letter or is_digit:
                pieces.append(table[ch])
            else:
                pieces.append(ch)
        return "".join(pieces)
    def meispower(s):
        """Extract the image path from a packed ``eval(...)`` script string.

        The text from the first ``}(`` onwards is taken, its last 19 characters
        are dropped, and the rest is split on commas into the arguments for
        ``urlparse``. The result of ``urlparse`` is searched for
        ``imgpath=...`` up to the next ``;``, and that match is returned
        without its first 10 and last 2 characters.

        NOTE(review): the fixed offsets (19, 10, 2) and the comma split assume
        one specific script layout -- presumably the usual packer tail
        ``'.split('|'),0,{}))`` -- confirm against a live page.

        Args:
            s: Script source containing the packed call.

        Returns:
            The extracted image path string.

        Raises:
            IndexError: If ``}(`` or ``imgpath=`` is not found, or fewer than
                four comma-separated fields are present.
            ValueError: If the second or third field is not an integer.
        """
        tail = re.compile(r"(?=\}\().*", re.IGNORECASE).findall(s)[0]
        cut = len(tail) - 19
        fields = tail[0:cut].split(',')
        # The fourth field carries the '|'-separated word list after one
        # leading character.
        words = fields[3][1:].split('|')
        unpacked = urlparse(fields[0], int(fields[1]), int(fields[2]), words)
        assignment = re.findall('imgpath=[^;]*', unpacked)[0]
        stop = len(assignment) - 2
        return assignment[10:stop]
    def pictofile(weburl,filename,loop=100):
        """Download ``weburl`` and store the response body in ``filename``.

        Nothing is fetched when ``filename`` already exists. A response shorter
        than 2048 bytes is treated as a failed download and retried, as is any
        error raised while fetching or writing. After ``loop + 1`` failed
        attempts (or immediately when ``loop`` is negative) a failure message
        is printed and the function returns.

        Args:
            weburl: URL of the picture to fetch.
            filename: Path of the file to create.
            loop: Number of retries allowed after the first attempt.

        Returns:
            None. Success and failure are only reported on standard output.
        """
        # Retry in a loop rather than by recursing out of an except handler,
        # so a long run of failures does not pile up stack frames.
        while loop >= 0:
            loop -= 1
            if os.path.exists(filename):
                return
            try:
                # The with-block closes the response even when read() raises.
                with urllib.request.urlopen(weburl) as response:
                    data = response.read()
            except socket.timeout:
                print('timeout')
                continue
            except Exception as e:
                print('error', e)
                continue
            if len(data) < 2048:
                # Too small to be accepted as a picture: try again.
                continue
            print('download from %s name is %s\n' % (weburl, filename))
            try:
                with open('%s' % filename, 'wb') as myfile:
                    myfile.write(data)
            except Exception as e:
                print('error', e)
                # Remove a partially written file; otherwise the exists()
                # check above would take it for a finished download.
                try:
                    os.remove(filename)
                except OSError:
                    pass
                continue
            return
        print('can\'t download the picture %s' % weburl)
    def downloadpic(url,loadpicdir,num):
        """Download one picture into ``loadpicdir`` and release the worker slot.

        The folder is created when missing. The picture is saved through
        ``pictofile`` as ``<num>-manhua<name>``, where ``<name>`` is the
        trailing run of ``0-9``, ``a-z`` and ``.`` characters of ``url``.
        ``currentthreadnum`` is decremented on every exit path.

        Args:
            url: Address of the picture.
            loadpicdir: Folder that receives the file.
            num: Number used as the file name prefix.
        """
        global currentthreadnum,mutex,mutex2
        try:
            try:
                # The folder is created in place instead of os.chdir()-ing
                # into it: the working directory is shared by all threads, and
                # changing it breaks relative target paths. The with-block
                # also guarantees mutex2 is released when creation fails.
                with mutex2:
                    if not os.path.isdir(loadpicdir):
                        print("can't open the floder %s will be create"%loadpicdir)
                        os.makedirs(loadpicdir, exist_ok=True)
                        print('create floder succeed')
            except OSError:
                print("can't create floder %s"%loadpicdir)
                return
            name = re.findall(r'[0-9a-z.]*\Z', url)
            filename = 'manhua' + name[0]
            pictofile(url, os.path.join(loadpicdir, str(num) + '-' + filename))
        finally:
            # Always give the slot back; otherwise the loop that waits for
            # currentthreadnum to drop below threadcount can stall forever.
            with mutex:
                currentthreadnum = currentthreadnum - 1
    def downloadchapter(url,loadpicdir,num,begin=0):
        """Start one download thread per page of a chapter.

        The chapter page at ``manhuaweb + url`` is fetched. Its title (the text
        after ``<title>`` up to the first ``_``) becomes the folder suffix, and
        the image base path is recovered from the page's ``eval`` script via
        ``meispower``. A ``downloadpic`` thread is then started for every page
        number in ``range(begin, num)``, with at most ``threadcount`` threads
        running at a time.

        Args:
            url: Chapter path, appended to ``manhuaweb``.
            loadpicdir: Folder prefix; the chapter title is appended to it.
            num: End of the page range (exclusive); also passed on to
                ``downloadpic``.
            begin: First page number to download.

        Raises:
            IndexError: If the page has no ``<title>`` or no ``eval`` script.
        """
        global manhuaweb,threadcount,currentthreadnum,mutex
        print(manhuaweb+url)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(manhuaweb+url) as response:
            webdata = response.read().decode('UTF-8')
        # Drop the 7-character '<title>' prefix from the match.
        chaptername = re.findall(r'<title>[^_]*', webdata)[0][7:]
        webscrip = re.findall(r'eval.*[^<>]', webdata)
        chapterurl = 'http://mhimg.ali213.net' + meispower(webscrip[0])
        for i in range(begin, num):
            # Wait for a free worker slot, then reserve it.
            while currentthreadnum >= threadcount:
                time.sleep(0.5)
            with mutex:
                currentthreadnum = currentthreadnum + 1
            worker = threading.Thread(
                target=downloadpic,
                args=(r'%s%d.jpg' % (chapterurl, i), loadpicdir + chaptername, num))
            try:
                worker.start()
            except socket.error:
                # The thread never ran, so hand the reserved slot back. The
                # page is skipped: rewinding the index of a for loop has no
                # effect, so no retry is attempted.
                with mutex:
                    currentthreadnum = currentthreadnum - 1
            except Exception as error:
                # Give the slot back here as well, otherwise it stays reserved
                # for the rest of the run.
                with mutex:
                    currentthreadnum = currentthreadnum - 1
                print(error, 'break')
                print('download chapter %d of picture make a error' % i)
                break
    if __name__=='__main__':
        # Script entry point: read the comic's index page, collect chapter
        # links and page counts, then download the chapters one by one.
        manhuaweb = r'http://manhua.ali213.net'
        socket.setdefaulttimeout(60.0)
        mutex = threading.Lock()   # guards currentthreadnum
        mutex2 = threading.Lock()  # used by downloadpic around folder setup

        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(weburl) as webfile:
            webdata = webfile.read().decode('UTF-8')

        meshmode = re.compile(r'<div class="detail_body_right_sec_con">.*</div>')
        meshdata = meshmode.findall(webdata)[0]
        # Page-count labels: digits followed by the character for "page".
        indexdata = re.compile(r'([0-9]*页)').findall(meshdata)
        # Chapter links.
        picurldata = re.compile(r'/comic/[0-9/]*.html').findall(meshdata)

        chapterlength = len(picurldata)
        nummode = re.compile(r'[\d]+')

        # Chapters are taken from the end of the list towards the front,
        # starting at offset chapterbegin.
        for i in range(chapterbegin, chapterlength):
            position = chapterlength - i - 1
            pagecount = int(nummode.findall(indexdata[position])[0])
            downloadchapter(picurldata[position], floder, pagecount)

    js
    下一篇:没有了