当前位置 主页 > 网站技术 > 代码类 >

    Python爬虫实现的根据分类爬取豆瓣电影信息功能示例

    栏目:代码类 时间:2019-09-15 14:07

    本文实例讲述了Python爬虫实现的根据分类爬取豆瓣电影信息功能。分享给大家供大家参考,具体如下:

    代码的入口:

    if __name__ == '__main__':  main()
    #! /usr/bin/python3
# -*- coding:utf-8 -*-
# author:Sirius.Zhao
"""Crawl Douban movie rankings category by category and store them in MySQL.

Flow (see ``main``):

1. Read the category links from https://movie.douban.com/chart.
2. For every category and every entry of ``percent_list`` ask the
   ``top_list_count`` endpoint how many movies exist, then download them
   all from the ``top_list`` endpoint.
3. De-duplicate the movies in ``dict_movies`` (keyed by title + release
   date) and insert the records into the ``douban.movies`` table.

Example URLs used by the crawler:

    https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90
    https://movie.douban.com/j/chart/top_list_count?type=24&interval_id=100:90
        -> {"playable_count":232,"total":431,"unwatched_count":431}
    https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&start=0&limit=562
"""
import datetime
import json
import random
import sys
import time
# `imp` was removed in Python 3.12; importlib.reload is the same function.
# The original script also called `reload(sys)` at import time, a Python 2
# idiom that serves no purpose on Python 3, so that call has been dropped.
from importlib import reload
from urllib.parse import quote
from urllib.request import Request
from urllib.request import urlopen

import pymysql
import requests
from bs4 import BeautifulSoup


def LoadUserAgents(uafile):
    """Load the user-agent pool from a text file and shuffle it.

    uafile : string
        path to text file of user agents, one per line

    Returns a list of ``bytes`` objects (the file is read in binary mode)
    in random order. Blank lines are skipped.
    """
    uas = []
    with open(uafile, 'rb') as uaf:
        for ua in uaf:
            ua = ua.strip()
            # Every line read from a file is non-empty (it contains at least
            # the newline), so test the stripped value to really skip blanks.
            if ua:
                # Drops the first character and the last two characters.
                # NOTE(review): presumably this removes quoting/punctuation
                # around each entry -- confirm against user_agents.txt.
                uas.append(ua[1:-2])
    random.shuffle(uas)
    return uas


# Pool of user agents; one is picked at random for every category.
uas = LoadUserAgents("user_agents.txt")

# All movies collected so far, de-duplicated.
# key: title + release_date, value: the record list inserted into MySQL.
dict_movies = {}


def datetime_to_timestamp_in_milliseconds(d):
    """Return the current Unix time in milliseconds.

    NOTE(review): the argument ``d`` is ignored -- the function always
    returns "now", regardless of what is passed in. It is not called
    anywhere in this script.
    """
    return int(round(time.time() * 1000))


# Percentage intervals passed as ``interval_id`` to the ranking endpoints.
percent_list = ['100:90', '90:80', '80:70', '70:60', '60:50',
                '50:40', '40:30', '30:20', '20:10', '10:0']

# Cookie captured from a browser session on movie.douban.com (together with
# the Referer it mimics a request coming from the chart page).
_DOUBAN_COOKIE = (
    'bid=mMrd75oQWFA; __utmc=30149280; __utmc=223695111; '
    '__yadk_uid=TsnvvnzAl9l5hXsJExLg5PkZQD8tW2xu; ll="108288"; '
    '_vwo_uuid_v2=DA5ED1377260F937BEC8CBD3785E44E53|98ebf520a520de4c9c6b9bed6d211cd7; '
    '_pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522309082%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DR23_MHR8K3SFj2J4gH-0n2G67VhfRtaG8GFHstysqjnPZ_HxqpDmGX54pQSSCCCd%26wd%3D%26eqid%3Dde9da0fa00002a7f000000035abc9802%22%5D; '
    '_pk_ses.100001.4cf6=*; '
    '__utma=30149280.65574578.1521358273.1522244587.1522309083.7; '
    '__utmz=30149280.1522309083.7.7.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
    '__utma=223695111.505210566.1522198584.1522244587.1522309083.3; '
    '__utmb=223695111.0.10.1522309083; '
    '__utmz=223695111.1522309083.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; '
    '__utmt=1; __utmb=30149280.1.10.1522309083; '
    '_pk_id.100001.4cf6=c6e6b98e6f177261.1522198584.3.1522309214.1522248302.'
)


def find_iterm(url):
    """Fetch the chart page and return its category links.

    url : string
        address of the chart page (https://movie.douban.com/chart)

    Returns ``[names, hrefs]``: two parallel lists with the link text and
    the ``href`` attribute of every ``div.types span a`` element.
    """
    # Parse while the response is still open, then release the connection.
    with urlopen(url) as response:
        bs = BeautifulSoup(response, 'html.parser')
    iterms = bs.select('div.types span a')
    iterms_href = [iterm.get('href') for iterm in iterms]
    iterms_list = [iterm.text for iterm in iterms]
    return [iterms_list, iterms_href]


def find_total_num(suffix, head):
    """Return the number of movies in one category / percentage interval.

    suffix : string
        query-string tail appended after ``type=``,
        e.g. ``24&interval_id=100:90``
    head : dict
        HTTP headers sent with the request

    Returns the ``total`` field of the JSON answer.
    """
    link_total = "https://movie.douban.com/j/chart/top_list_count?type=" + suffix
    print(link_total)
    req = Request(link_total, headers=head)
    with urlopen(req) as total_data:
        return json.load(total_data)['total']


def insert_into_mysql(dict_movies):
    """Insert every collected movie record into ``douban.movies``.

    dict_movies : dict
        mapping of de-duplication key to a 9-item record
        ``[rating, title, release_date, regions, types, actors,
        vote_count, score, rank]``

    All rows are committed in a single transaction; the cursor and the
    connection are closed even when an insert fails.
    """
    con = pymysql.connect(host="localhost", user="root", password="root",
                          database="douban", charset='utf8', port=3306)
    try:
        cursor = con.cursor()
        try:
            print(dict_movies)
            sql_insert = (
                "INSERT INTO `douban`.`movies` (`rating`, `title`, `release_date`, "
                "`regions`, `types`, `actors`, `vote_count`, `score`, `rank`) "
                "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            )
            for record in dict_movies.values():
                cursor.execute(sql_insert, record)
            con.commit()
        finally:
            cursor.close()
    finally:
        con.close()


def find_allMovie_iterm(category, href):
    """Download all movies of one category into ``dict_movies``.

    category : string
        category name (currently unused; kept for the caller's signature)
    href : string
        category link cut off right after ``interval_id=``, e.g.
        ``/typerank?type_name=...&type=24&interval_id=``

    For every interval in ``percent_list`` the movie count is requested
    first and then used as ``limit`` so the whole interval arrives in one
    response. Each movie in the JSON list looks like::

        {
          "rating": ["8.2", "45"],
          "rank": 391,
          "types": ["喜剧", "犯罪", "爱情"],
          "regions": ["美国"],
          "title": "天堂里的烦恼",
          "release_date": "1932-10-21",
          "vote_count": 1868,
          "score": "8.2",
          "actors": ["米利亚姆·霍普金斯", "凯·弗朗西斯", ...]
        }
    """
    head = {
        'User-Agent': random.choice(uas),
        'Cookie': _DOUBAN_COOKIE,
        'Referer': 'https://movie.douban.com/chart'
    }
    # Everything after "&type=", e.g. "24&interval_id=".
    suffix = href.split('&type=')[1]
    link_movies = "https://movie.douban.com/j/chart/top_list?type=" + suffix

    for stage in percent_list:
        # Pause between intervals to throttle the requests.
        time.sleep(2)
        total = find_total_num(suffix + stage, head)
        url_movies = link_movies + stage + '&start=0&limit=' + str(total)
        req = Request(url_movies, headers=head)
        with urlopen(req) as response:
            # JSON array of movie objects: [{...}, {...}, ...]
            movies_list = json.load(response)
        print(movies_list)
        for movie in movies_list:
            # Second element of "rating" divided by ten, e.g. "45" -> "4.5".
            rating = str(int(movie['rating'][1]) / 10)
            title = movie['title']
            release_date = movie['release_date']
            # title + release_date is the de-duplication key: a movie listed
            # under several categories is stored only once.
            dict_movies[title + release_date] = [
                rating,
                title,
                release_date,
                ','.join(movie['regions']),
                ','.join(movie['types']),
                ','.join(movie['actors']),
                movie['vote_count'],
                movie['score'],
                movie['rank'],
            ]


def main():
    """Crawl every category, then store the collected movies in MySQL."""
    url = 'https://movie.douban.com/chart'
    # Category names and their links, as two parallel lists.
    names, hrefs = find_iterm(url)
    for name, href in zip(names, hrefs):
        # Cut the link right before the default '100:90' interval so that
        # each interval from percent_list can be appended instead.
        find_allMovie_iterm(name, href.split('100:90')[0])
    # Insert all collected (de-duplicated) movies in one go.
    insert_into_mysql(dict_movies)


if __name__ == '__main__':
    main()